diff --git a/0001-openblas-0.2.15-system_lapack.patch b/0001-openblas-0.2.15-system_lapack.patch new file mode 100644 index 0000000000000000000000000000000000000000..4b843a9a4c853a9255c78eb38dc4713f89edd705 --- /dev/null +++ b/0001-openblas-0.2.15-system_lapack.patch @@ -0,0 +1,87 @@ +diff -up OpenBLAS-0.2.15/Makefile.system_lapack OpenBLAS-0.2.15/Makefile +--- OpenBLAS-0.2.15/Makefile.system_lapack 2015-10-27 13:44:50.000000000 -0700 ++++ OpenBLAS-0.2.15/Makefile 2015-10-28 09:14:56.696685503 -0700 +@@ -16,11 +16,7 @@ BLASDIRS += reference + endif + + SUBDIRS = $(BLASDIRS) +-ifneq ($(NO_LAPACK), 1) +-SUBDIRS += lapack +-endif +- +-LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) ++SUBDIRS += lapack + + SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench + +@@ -211,57 +207,8 @@ hpl_p : + fi; \ + done + +-ifeq ($(NO_LAPACK), 1) + netlib : +- +-else +-netlib : lapack_prebuild +-ifndef NOFORTRAN +- @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib +- @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib +-endif +-ifndef NO_LAPACKE +- @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib +-endif +-endif +- +-prof_lapack : lapack_prebuild +- @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof +- +-lapack_prebuild : +-ifndef NOFORTRAN +- -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc +-ifeq ($(FC), gfortran) +- -@echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc +-ifdef SMP +- -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc +-else +- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc +-endif +-else +- -@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc +-endif +- -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc +-endif ++ @$(MAKE) -C $(NETLIB_LAPACK_DIR) + + large.tgz : + ifndef NOFORTRAN +diff -up OpenBLAS-0.2.15/Makefile.system.system_lapack OpenBLAS-0.2.15/Makefile.system +--- OpenBLAS-0.2.15/Makefile.system.system_lapack 2015-10-27 13:44:50.000000000 -0700 ++++ OpenBLAS-0.2.15/Makefile.system 2015-10-28 09:14:39.994350500 -0700 +@@ -9,7 +9,7 @@ ifndef TOPDIR + TOPDIR = . 
+ endif + +-NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib ++NETLIB_LAPACK_DIR = $(TOPDIR)/netliblapack + + # Default C compiler + # - Only set if not specified on the command line or inherited from the environment. diff --git a/0002-openblas-0.2.5-libname.patch b/0002-openblas-0.2.5-libname.patch new file mode 100644 index 0000000000000000000000000000000000000000..e30ab8bf91ad0dfb05572b11cb21680483cce0d4 --- /dev/null +++ b/0002-openblas-0.2.5-libname.patch @@ -0,0 +1,24 @@ +diff -up OpenBLAS-0.2.5/Makefile.system.orig OpenBLAS-0.2.5/Makefile.system +--- OpenBLAS-0.2.5/Makefile.system.orig 2012-11-27 01:24:53.000000000 +0200 ++++ OpenBLAS-0.2.5/Makefile.system 2012-12-24 16:13:57.316689688 +0200 +@@ -758,16 +758,16 @@ ifndef SMP + LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX) + LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX) + else +-LIBNAME = $(LIBPREFIX)_$(LIBCORE)p$(REVISION).$(LIBSUFFIX) +-LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)p$(REVISION)_p.$(LIBSUFFIX) ++LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX) ++LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX) + endif + else + ifndef SMP + LIBNAME = $(LIBPREFIX)$(REVISION).$(LIBSUFFIX) + LIBNAME_P = $(LIBPREFIX)$(REVISION)_p.$(LIBSUFFIX) + else +-LIBNAME = $(LIBPREFIX)p$(REVISION).$(LIBSUFFIX) +-LIBNAME_P = $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX) ++LIBNAME = $(LIBPREFIX)$(REVISION).$(LIBSUFFIX) ++LIBNAME_P = $(LIBPREFIX)$(REVISION)_p.$(LIBSUFFIX) + endif + endif + diff --git a/0003-openblas-0.3.11-tests.patch b/0003-openblas-0.3.11-tests.patch new file mode 100644 index 0000000000000000000000000000000000000000..abbdf45f4aeaf952f7fb1707a8104a4892023963 --- /dev/null +++ b/0003-openblas-0.3.11-tests.patch @@ -0,0 +1,26 @@ +diff -up OpenBLAS-0.3.21/Makefile.fixtests OpenBLAS-0.3.21/Makefile +--- OpenBLAS-0.3.21/Makefile.fixtests 2022-08-26 07:37:06.257272957 +0200 ++++ OpenBLAS-0.3.21/Makefile 2022-08-26 07:37:53.168414307 +0200 +@@ -147,18 +147,18 @@ tests : + ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) + touch $(LIBNAME) + ifndef NO_FBLAS +- $(MAKE) -C test all ++ $(MAKE) -C test FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all + endif + endif + ifneq ($(ONLY_CBLAS), 1) +- $(MAKE) -C utest all ++ $(MAKE) -C utest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all + endif + ifneq ($(NO_CBLAS), 1) + ifneq ($(ONLY_CBLAS), 1) +- $(MAKE) -C ctest all ++ $(MAKE) -C ctest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all + endif + ifeq ($(CPP_THREAD_SAFETY_TEST), 1) +- $(MAKE) -C cpp_thread_test all ++ $(MAKE) -C cpp_thread_test FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all + endif + endif + diff --git a/0004-OpenBLAS-0.3.23-Add-opt-for-LoongArch64.patch b/0004-OpenBLAS-0.3.23-Add-opt-for-LoongArch64.patch new file mode 100644 index 0000000000000000000000000000000000000000..a13979238ba58fe187acae043f454045975ecf54 --- /dev/null +++ b/0004-OpenBLAS-0.3.23-Add-opt-for-LoongArch64.patch @@ -0,0 +1,18231 @@ +From 642128b0e5f86a5bbb304350ff4826028ccd2e20 Mon Sep 17 00:00:00 2001 +From: gxw +Date: Fri, 11 Aug 2023 10:11:51 +0800 +Subject: [PATCH] OpenBLAS-0.3.23: Add opt for LoongArch64 + +--- + .github/workflows/loongarch64.yml | 110 + + Makefile.system | 10 +- + c_check | 35 + + c_check.pl | 45 + + common_loongarch64.h | 13 + + cpuid_loongarch64.c | 18 +- + kernel/loongarch64/KERNEL.LOONGSON3R5 | 31 +- + kernel/loongarch64/KERNEL.generic | 4 + + 
kernel/loongarch64/dgemm_kernel_16x4.S | 4058 +++++++---------- + kernel/loongarch64/dgemv_n_8_lasx.S | 554 +++ + kernel/loongarch64/dgemv_t_8_lasx.S | 481 ++ + .../loongarch64/dtrsm_kernel_LN_16x4_lasx.S | 1366 ++++++ + .../loongarch64/dtrsm_kernel_LT_16x4_lasx.S | 959 ++++ + .../loongarch64/dtrsm_kernel_RN_16x4_lasx.S | 882 ++++ + .../loongarch64/dtrsm_kernel_RT_16x4_lasx.S | 953 ++++ + kernel/loongarch64/dtrsm_kernel_macro.S | 2147 +++++++++ + kernel/loongarch64/loongarch64_asm.S | 430 ++ + kernel/loongarch64/sgemm_kernel_16x8_lasx.S | 2348 ++++++++++ + kernel/loongarch64/sgemm_ncopy_16_lasx.S | 463 ++ + kernel/loongarch64/sgemm_ncopy_8_lasx.S | 298 ++ + kernel/loongarch64/sgemm_tcopy_16_lasx.S | 526 +++ + kernel/loongarch64/sgemm_tcopy_8_lasx.S | 406 ++ + kernel/loongarch64/sgemv_n_8_lasx.S | 463 ++ + kernel/loongarch64/sgemv_t_8_lasx.S | 405 ++ + lapack/laswp/loongarch64/Makefile | 5 + + param.h | 18 +- + 26 files changed, 14611 insertions(+), 2417 deletions(-) + create mode 100644 .github/workflows/loongarch64.yml + create mode 100644 kernel/loongarch64/dgemv_n_8_lasx.S + create mode 100644 kernel/loongarch64/dgemv_t_8_lasx.S + create mode 100644 kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S + create mode 100644 kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S + create mode 100644 kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S + create mode 100644 kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S + create mode 100644 kernel/loongarch64/dtrsm_kernel_macro.S + create mode 100644 kernel/loongarch64/loongarch64_asm.S + create mode 100644 kernel/loongarch64/sgemm_kernel_16x8_lasx.S + create mode 100644 kernel/loongarch64/sgemm_ncopy_16_lasx.S + create mode 100644 kernel/loongarch64/sgemm_ncopy_8_lasx.S + create mode 100644 kernel/loongarch64/sgemm_tcopy_16_lasx.S + create mode 100644 kernel/loongarch64/sgemm_tcopy_8_lasx.S + create mode 100644 kernel/loongarch64/sgemv_n_8_lasx.S + create mode 100644 kernel/loongarch64/sgemv_t_8_lasx.S + +diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml +new file mode 100644 +index 000000000..5501e98e0 +--- /dev/null ++++ b/.github/workflows/loongarch64.yml +@@ -0,0 +1,110 @@ ++name: loongarch64 qemu test ++ ++on: [push, pull_request] ++ ++jobs: ++ TEST: ++ runs-on: ubuntu-latest ++ strategy: ++ fail-fast: false ++ matrix: ++ include: ++ - target: LOONGSONGENERIC ++ triple: loongarch64-unknown-linux-gnu ++ opts: NO_SHARED=1 TARGET=LOONGSONGENERIC ++ - target: LOONGSON3R5 ++ triple: loongarch64-unknown-linux-gnu ++ opts: NO_SHARED=1 TARGET=LOONGSON3R5 ++ - target: LOONGSON2K1000 ++ triple: loongarch64-unknown-linux-gnu ++ opts: NO_SHARED=1 TARGET=LOONGSON2K1000 ++ ++ steps: ++ - name: Checkout repository ++ uses: actions/checkout@v3 ++ ++ - name: Install APT deps ++ run: | ++ sudo add-apt-repository ppa:savoury1/virtualisation ++ sudo apt-get update ++ sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ ++ qemu-user-static ++ ++ - name: Download and install loongarch64-toolchain ++ run: | ++ wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz ++ tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt ++ ++ - name: Set env ++ run: | ++ echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV ++ echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV ++ ++ - name: Compilation cache ++ uses: actions/cache@v3 ++ with: ++ 
path: ~/.ccache ++ key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} ++ restore-keys: | ++ ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} ++ ccache-${{ runner.os }}-${{ matrix.target }} ++ ++ - name: Configure ccache ++ run: | ++ test -d ~/.ccache || mkdir -p ~/.ccache ++ echo "max_size = 300M" > ~/.ccache/ccache.conf ++ echo "compression = true" >> ~/.ccache/ccache.conf ++ ccache -s ++ ++ - name: Disable utest dsdot:dsdot_n_1 ++ run: | ++ echo -n > utest/test_dsdot.c ++ echo "Due to the qemu versions 7.2 causing utest cases to fail," ++ echo "the utest dsdot:dsdot_n_1 have been temporarily disabled." ++ ++ - name: Build OpenBLAS ++ run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) ++ ++ - name: Test ++ run: | ++ qemu-loongarch64-static ./utest/openblas_utest ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3 ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1 ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1 ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1 ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1 ++ rm -f ./test/?BLAT2.SUMM ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat ++ rm -f ./test/?BLAT2.SUMM ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat ++ rm -f ./test/?BLAT3.SUMM ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < 
./test/cblat3.dat ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat ++ rm -f ./test/?BLAT3.SUMM ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat +diff --git a/Makefile.system b/Makefile.system +index 343b94bb3..1eabff27d 100644 +--- a/Makefile.system ++++ b/Makefile.system +@@ -932,8 +932,12 @@ BINARY_DEFINED = 1 + endif + + ifeq ($(ARCH), loongarch64) +-CCOMMON_OPT += -march=loongarch64 -mabi=lp64 +-FCOMMON_OPT += -march=loongarch64 -mabi=lp64 ++LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d) ++ifneq ($(LA64_ABI), lp64d) ++LA64_ABI=lp64 ++endif ++CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) ++FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) + endif + + endif +@@ -1763,6 +1767,8 @@ export TARGET_CORE + export NO_AVX512 + export NO_AVX2 + export BUILD_BFLOAT16 ++export NO_LSX ++export NO_LASX + + export SBGEMM_UNROLL_M + export SBGEMM_UNROLL_N +diff --git a/c_check b/c_check +index e8f90e18a..5a7163a63 100755 +--- a/c_check ++++ b/c_check +@@ -181,6 +181,37 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then + rm -rf "$tmpd" + fi + ++no_lsx=0 ++no_lasx=0 ++if [ "$architecture" = "loongarch64" ]; then ++ tmpd="$(mktemp -d)" ++ tmplsx="$tmpd/lsx.c" ++ codelsx='"vadd.b $vr0, $vr0, $vr0"' ++ lsx_flags='-march=loongarch64 -mlsx' ++ printf "#include \n\n" >> "$tmplsx" ++ printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx" ++ args="$lsx_flags -o $tmplsx.o $tmplsx" ++ { ++ $compiler_name $flags $args >/dev/null 2>&1 ++ } || { ++ no_lsx=1 ++ } ++ ++ tmplasx="$tmpd/lasx.c" ++ codelasx='"xvadd.b $xr0, $xr0, $xr0"' ++ lasx_flags='-march=loongarch64 -mlasx' ++ printf "#include \n\n" >> "$tmplasx" ++ printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx" ++ args="$lasx_flags -o $tmplasx.o $tmplasx" ++ { ++ $compiler_name $flags $args >/dev/null 2>&1 ++ } || { ++ no_lasx=1 ++ } ++ ++ rm -rf "$tmpd" ++fi ++ + case "$data" in + *ARCH_X86_64*) architecture=x86_64 ;; + *ARCH_X86*) architecture=x86 ;; +@@ -395,6 +426,8 @@ done + [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" + [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" + [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" ++ [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n" ++ [ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n" + } >> "$makefile" + + os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ ` +@@ -410,6 +443,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' ` + [ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu" + [ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n" + [ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n" ++ [ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n" ++ [ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n" + } >> "$config" + + +diff --git a/c_check.pl b/c_check.pl +index 6ce28e11b..7a860a211 100644 +--- a/c_check.pl ++++ b/c_check.pl +@@ -232,6 +232,47 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { + } + } + ++$no_lsx = 0; ++$no_lasx = 0; ++if (($architecture eq "loongarch64")) { ++ eval "use File::Temp qw(tempfile)"; ++ if ($@){ ++ warn "could not load PERL module File::Temp, so could not check LSX and LASX 
capatibility"; ++ } else { ++ $tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); ++ $codelsx = '"vadd.b $vr0, $vr0, $vr0"'; ++ $lsx_flags = "-march=loongarch64 -mlsx"; ++ print $tmplsx "#include \n\n"; ++ print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n"; ++ ++ $args = "$lsx_flags -o $tmplsx.o $tmplsx"; ++ my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); ++ system(@cmd) == 0; ++ if ($? != 0) { ++ $no_lsx = 1; ++ } else { ++ $no_lsx = 0; ++ } ++ unlink("$tmplsx.o"); ++ ++ $tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); ++ $codelasx = '"xvadd.b $xr0, $xr0, $xr0"'; ++ $lasx_flags = "-march=loongarch64 -mlasx"; ++ print $tmplasx "#include \n\n"; ++ print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n"; ++ ++ $args = "$lasx_flags -o $tmplasx.o $tmplasx"; ++ my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); ++ system(@cmd) == 0; ++ if ($? != 0) { ++ $no_lasx = 1; ++ } else { ++ $no_lasx = 0; ++ } ++ unlink("$tmplasx.o"); ++ } ++} ++ + $architecture = x86 if ($data =~ /ARCH_X86/); + $architecture = x86_64 if ($data =~ /ARCH_X86_64/); + $architecture = e2k if ($data =~ /ARCH_E2K/); +@@ -424,6 +465,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1; + print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; + print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1; + print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1; ++print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1; ++print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1; + + $os =~ tr/[a-z]/[A-Z]/; + $architecture =~ tr/[a-z]/[A-Z]/; +@@ -437,6 +480,8 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; + print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; + print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; + print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1; ++print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1; ++print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1; + + + if ($os eq "LINUX") { +diff --git a/common_loongarch64.h b/common_loongarch64.h +index e15539b5f..ce1fcf091 100644 +--- a/common_loongarch64.h ++++ b/common_loongarch64.h +@@ -83,6 +83,19 @@ static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; + } + ++#ifndef NO_AFFINITY ++static inline int WhereAmI(void){ ++ int ret = 0, counter = 0; ++ __asm__ volatile ( ++ "rdtimel.w %[counter], %[id]" ++ : [id]"=r"(ret), [counter]"=r"(counter) ++ : ++ : "memory" ++ ); ++ return ret; ++} ++#endif ++ + #ifdef DOUBLE + #define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") + #else +diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c +index ca07c7ffb..7c389db27 100644 +--- a/cpuid_loongarch64.c ++++ b/cpuid_loongarch64.c +@@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + **********************************************************************************/ + + #include ++#include + + /* If LASX extension instructions supported, + * using core LOONGSON3R5 +@@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ #define CPU_LOONGSON3R5 1 + #define CPU_LOONGSON2K1000 2 + +-#define LOONGARCH_CFG2 0x02 +-#define LOONGARCH_LASX 1<<7 +-#define LOONGARCH_LSX 1<<6 ++#define LA_HWCAP_LSX (1<<4) ++#define LA_HWCAP_LASX (1<<5) + + static char *cpuname[] = { + "LOONGSONGENERIC", +@@ -64,17 +64,11 @@ static char *cpuname_lower[] = { + + int detect(void) { + #ifdef __linux +- uint32_t reg = 0; ++ int flag = (int)getauxval(AT_HWCAP); + +- __asm__ volatile ( +- "cpucfg %0, %1 \n\t" +- : "+&r"(reg) +- : "r"(LOONGARCH_CFG2) +- ); +- +- if (reg & LOONGARCH_LASX) ++ if (flag & LA_HWCAP_LASX) + return CPU_LOONGSON3R5; +- else if (reg & LOONGARCH_LSX) ++ else if (flag & LA_HWCAP_LSX) + return CPU_LOONGSON2K1000; + else + return CPU_GENERIC; +diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 +index cda359040..011e8b89e 100644 +--- a/kernel/loongarch64/KERNEL.LOONGSON3R5 ++++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 +@@ -1,3 +1,4 @@ ++ifndef NO_LASX + DGEMMKERNEL = dgemm_kernel_16x4.S + DGEMMINCOPY = dgemm_ncopy_16.S + DGEMMITCOPY = dgemm_tcopy_16.S +@@ -8,7 +9,29 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) + DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) + DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +-DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +-DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +-DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +-DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ++DGEMVNKERNEL = dgemv_n_8_lasx.S ++DGEMVTKERNEL = dgemv_t_8_lasx.S ++ ++SGEMMKERNEL = sgemm_kernel_16x8_lasx.S ++SGEMMINCOPY = sgemm_ncopy_16_lasx.S ++SGEMMITCOPY = sgemm_tcopy_16_lasx.S ++SGEMMONCOPY = sgemm_ncopy_8_lasx.S ++SGEMMOTCOPY = sgemm_tcopy_8_lasx.S ++SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) ++SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) ++SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) ++SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) ++ ++SGEMVNKERNEL = sgemv_n_8_lasx.S ++SGEMVTKERNEL = sgemv_t_8_lasx.S ++ ++DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S ++DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_lasx.S ++DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S ++DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S ++endif ++ ++STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ++STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ++STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ++STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic +index b772a6f82..213add9ee 100644 +--- a/kernel/loongarch64/KERNEL.generic ++++ b/kernel/loongarch64/KERNEL.generic +@@ -132,12 +132,16 @@ CSWAPKERNEL = ../arm/zswap.c + ZSWAPKERNEL = ../arm/zswap.c + + SGEMVNKERNEL = ../arm/gemv_n.c ++ifndef DGEMVNKERNEL + DGEMVNKERNEL = ../arm/gemv_n.c ++endif + CGEMVNKERNEL = ../arm/zgemv_n.c + ZGEMVNKERNEL = ../arm/zgemv_n.c + + SGEMVTKERNEL = ../arm/gemv_t.c ++ifndef DGEMVTKERNEL + DGEMVTKERNEL = ../arm/gemv_t.c ++endif + CGEMVTKERNEL = ../arm/zgemv_t.c + ZGEMVTKERNEL = ../arm/zgemv_t.c + +diff --git a/kernel/loongarch64/dgemm_kernel_16x4.S b/kernel/loongarch64/dgemm_kernel_16x4.S +index 13faa977e..f8e26fda2 100644 +--- a/kernel/loongarch64/dgemm_kernel_16x4.S ++++ b/kernel/loongarch64/dgemm_kernel_16x4.S +@@ -28,6 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + #include "common.h" + ++/********************************************************************* ++* 2023/06/28 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++* 2023/06/28 guxiwei ++* Parameter: ++* DGEMM_DEFAULT_UNROLL_N 4 ++* DGEMM_DEFAULT_UNROLL_M 16 ++* DGEMM_DEFAULT_P 32 ++* DGEMM_DEFAULT_Q 152 ++* DGEMM_DEFAULT_R 858 ++* A_PR1 1024 ++* B_PR1 256 ++* ++* ++* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000: ++* 1 thread: 36.0 GFLOPS ++* 2 threads: 71.6 GFLOPS ++* 3 threads: 101.5 GFLOPS ++* 4 threads: 132.8 GFLOPS ++*********************************************************************/ ++ + /* Function parameters */ + #define M $r4 // param 1: bm + #define N $r5 // param 2: bn +@@ -68,1290 +93,1331 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #define U4 $xr4 + #define U5 $xr5 + #define U6 $xr6 +-#define D0 $xr7 +-#define D1 $xr8 +-#define D2 $xr9 +-#define D3 $xr10 +-#define D4 $xr11 +-#define D5 $xr12 +-#define D6 $xr13 +-#define D7 $xr14 +-#define D8 $xr15 +-#define D9 $xr16 +-#define D10 $xr17 +-#define D11 $xr18 +-#define D12 $xr19 +-#define D13 $xr20 +-#define D14 $xr21 +-#define D15 $xr22 +-#define VALPHA $xr23 ++#define U7 $xr7 ++#define U8 $xr8 ++#define U9 $xr9 ++#define U10 $xr10 ++#define U11 $xr11 ++#define U12 $xr12 ++#define U13 $xr13 ++#define U14 $xr14 ++#define U15 $xr15 ++#define D0 $xr16 ++#define D1 $xr17 ++#define D2 $xr18 ++#define D3 $xr19 ++#define D4 $xr20 ++#define D5 $xr21 ++#define D6 $xr22 ++#define D7 $xr23 ++#define D8 $xr24 ++#define D9 $xr25 ++#define D10 $xr26 ++#define D11 $xr27 ++#define D12 $xr28 ++#define D13 $xr29 ++#define D14 $xr30 ++#define D15 $xr31 ++#define VALPHA $xr15 + + /* Prefetch interval */ +-#define A_PRE 0x200 ++#define A_PRE 0x400 + #define B_PRE 0x100 + +- PROLOGUE +- +- addi.d $sp, $sp, -56 +- /* Store regs */ +- SDARG $r23, $sp, 0 +- SDARG $r24, $sp, 8 +- SDARG $r25, $sp, 16 +- SDARG $r26, $sp, 24 +- SDARG $r27, $sp, 32 +- ST $f23, $sp, 40 +- ST ALPHA, $sp, 48 +- +- /* VALPHA = {ALPHA, ALPHA, ALPHA, ALPHA} */ +- xvld VALPHA, $sp, 48 +- xvreplve0.d VALPHA, VALPHA +- +-#if defined (TRMMKERNEL) && !defined(LEFT) +- sub.d OFF, ZERO, OFFSET +-#else +- xor OFF, OFF, OFF +-#endif +- +- /* if (!(N >> 2)) goto L_N3 */ +- srai.d J, N, 2 /* J = bn >> 2 */ +- andi N, N, 0x03 +- beq ZERO, J, .L_N3 +- +-.L_J1: /* J-- && This loop include Condition 1 */ +- +-/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! 
************************* +-* dgemm_core_16x4 */ +- move C0, C +- move A0, A +- slli.d T0, LDC, 3 +- add.d C1, C0, T0 +- addi.d J, J, -1 /* J-- */ +- add.d C2, C1, T0 +- add.d C3, C2, T0 +- +-#if defined(TRMMKERNEL) && defined(LEFT) +- move OFF, OFFSET +-#endif +- +- /* if (!(M >> 4)) goto L_M8 */ +- srai.d I, M, 4 /* I = bm >> 4 */ +- beq ZERO, I, .L_M8 +- +-.L_I1: /* I-- */ +-#if defined(TRMMKERNEL) +-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +- move B0, B +-#else +- slli.d T0, OFF, 0x07 +- add.d A0, A0, T0 +- slli.d T0, OFF, 0x05 +- add.d B0, B, T0 +-#endif +- +-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +- sub.d L, K, OFF +-#elif defined(LEFT) +- /* number of values in A */ +- addi.d L, OFF, 16 +-#else +- /* number of values in B */ +- addi.d L, OFF, 4 +-#endif +-#else // #if !defined(TRMMKERNEL) +- move B0, B +- move L, K /* L = bk */ +-#endif +- /* Calculate the first set of D0~D15, +- * avoidig set 0 operation +- * Load 16 * 64 from A0 +- * U0 = {a3, a2, a1, a0} +- * U1 = {a7, a6, a5, a4} +- * U2 = {a11, a10, a9, a8} +- * U3 = {a15, a14, a13, a12} +- */ ++.macro KERNEL2x16x4 + xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- preld 0, C0, 0x00 +- /* line 1 */ +- xvfmul.d D0, U0, U4 +- xvfmul.d D1, U1, U4 +- preld 0, C0, 0x40 +- xvfmul.d D2, U2, U4 +- xvfmul.d D3, U3, U4 +- +- xvldrepl.d U4, B0, 0x08 +- preld 0, C1, 0x00 +- /* line 2 */ +- xvfmul.d D4, U0, U4 +- xvfmul.d D5, U1, U4 +- preld 0, C1, 0x40 +- xvfmul.d D6, U2, U4 +- xvfmul.d D7, U3, U4 +- +- xvldrepl.d U4, B0, 0x10 +- preld 0, C2, 0x00 +- /* line 3 */ +- xvfmul.d D8, U0, U4 +- xvfmul.d D9, U1, U4 +- preld 0, C2, 0x40 +- xvfmul.d D10, U2, U4 +- xvfmul.d D11, U3, U4 +- +- xvldrepl.d U4, B0, 0x18 +- preld 0, C3, 0x00 +- /* line 4 */ +- xvfmul.d D12, U0, U4 +- xvfmul.d D13, U1, U4 +- preld 0, C3, 0x40 +- xvfmul.d D14, U2, U4 +- xvfmul.d D15, U3, U4 +- +- /* Add stride for A0 and B0 */ +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x20 +- /* Reduce L */ +- addi.d L, L, -1 +- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ +- /* if (TL < 1) goto L_L7 */ +- beq ZERO,TL, .L_L7 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 + +- /* Calculate 8 sets of D0~D15 */ +-.L_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 + xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ + xvld U2, A0, 0x40 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ + xvld U3, A0, 0x60 ++ xvfmadd.d D6, U10, U13, D6 ++ xvfmadd.d D7, U11, U13, D7 + +- /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 ++ xvfmadd.d D8, U8, U14, D8 ++ xvfmadd.d D9, U9, U14, D9 ++ + preld 0, B0, B_PRE ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D10, U10, U14, D10 ++ xvfmadd.d D11, U11, U14, D11 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D12, U8, U15, D12 ++ xvfmadd.d D13, U9, U15, D13 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 +- xvfmadd.d D10, U2, U4, D10 +- xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 +- xvfmadd.d D14, U2, U4, D14 +- xvfmadd.d D15, U3, U4, D15 ++ xvldrepl.d U7, 
B0, 0x18 ++ xvfmadd.d D14, U10, U15, D14 ++ xvfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + +- /***8-2***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- /* Cumulative D0~D15 */ +- xvldrepl.d U4, B0, 0x00 ++ xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 ++ ++ xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 ++ ++ xvld U10, A0, 0x40 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvld U11, A0, 0x60 ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 ++ + preld 0, B0, B_PRE ++ xvldrepl.d U13, B0, 0x08 ++ xvfmadd.d D10, U2, U6, D10 ++ xvfmadd.d D11, U3, U6, D11 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE ++ xvldrepl.d U14, B0, 0x10 ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 +- xvfmadd.d D10, U2, U4, D10 +- xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 +- xvfmadd.d D14, U2, U4, D14 +- xvfmadd.d D15, U3, U4, D15 ++ xvldrepl.d U15, B0, 0x18 ++ xvfmadd.d D14, U2, U7, D14 ++ xvfmadd.d D15, U3, U7, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 ++.endm + +- /***8-3***/ +- /* Load 16 * 64 from A0 */ ++.macro KERNEL2x16x4_END + xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ + xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ + xvld U2, A0, 0x40 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ + xvld U3, A0, 0x60 ++ xvfmadd.d D6, U10, U13, D6 ++ xvfmadd.d D7, U11, U13, D7 + +- /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 ++ xvfmadd.d D8, U8, U14, D8 ++ xvfmadd.d D9, U9, U14, D9 ++ + preld 0, B0, B_PRE ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D10, U10, U14, D10 ++ xvfmadd.d D11, U11, U14, D11 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D12, U8, U15, D12 ++ xvfmadd.d D13, U9, U15, D13 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 +- xvfmadd.d D10, U2, U4, D10 +- xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 +- xvfmadd.d D14, U2, U4, D14 +- xvfmadd.d D15, U3, U4, D15 ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D14, U10, U15, D14 ++ xvfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + +- /***8-4***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- /* Cumulative D0~D15 */ +- xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 ++ + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 ++ ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 ++ ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 ++ + preld 0, B0, B_PRE ++ xvfmadd.d D10, U2, U6, D10 ++ xvfmadd.d D11, U3, U6, D11 + +- 
xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 +- xvfmadd.d D10, U2, U4, D10 +- xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 ++ xvfmadd.d D14, U2, U7, D14 ++ xvfmadd.d D15, U3, U7, D15 ++.endm + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 +- xvfmadd.d D14, U2, U4, D14 +- xvfmadd.d D15, U3, U4, D15 ++.macro KERNEL8x16x4 ++.rept 4 ++ KERNEL2x16x4 ++.endr ++.endm + +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x20 ++.macro KERNEL8x16x4_END ++.rept 3 ++ KERNEL2x16x4 ++.endr ++ KERNEL2x16x4_END ++.endm + +- /***8-5***/ +- /* Load 16 * 64 from A0 */ ++.macro KERNEL2x8x4 + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 + +- /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- preld 0, B0, B_PRE ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- preld 0, A0, A_PRE ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 +- xvfmadd.d D10, U2, U4, D10 +- xvfmadd.d D11, U3, U4, D11 +- preld 0, A0, A_PRE + 0x40 ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U8, U14, D8 ++ xvfmadd.d D9, U9, U14, D9 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 +- xvfmadd.d D14, U2, U4, D14 +- xvfmadd.d D15, U3, U4, D15 ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U8, U15, D12 ++ xvfmadd.d D13, U9, U15, D13 + +- addi.d A0, A0, 0x80 ++ addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + +- /***8-6***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 + +- /* Cumulative D0~D15 */ +- xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U12, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- preld 0, B0, B_PRE + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- preld 0, A0, A_PRE ++ xvldrepl.d U13, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 +- xvfmadd.d D10, U2, U4, D10 +- xvfmadd.d D11, U3, U4, D11 +- preld 0, A0, A_PRE + 0x40 ++ xvldrepl.d U14, B0, 0x10 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 +- xvfmadd.d D14, U2, U4, D14 +- xvfmadd.d D15, U3, U4, D15 ++ xvldrepl.d U15, B0, 0x18 ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 + +- addi.d A0, A0, 0x80 ++ addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 ++.endm + +- /***8-7***/ +- /* Load 16 * 64 from A0 */ ++.macro KERNEL2x8x4_END + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 + +- /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U8, U13, D4 
++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U8, U14, D8 ++ xvfmadd.d D9, U9, U14, D9 ++ ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U8, U15, D12 ++ xvfmadd.d D13, U9, U15, D13 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x20 ++ + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- preld 0, B0, B_PRE + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- preld 0, A0, A_PRE ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 +- xvfmadd.d D10, U2, U4, D10 +- xvfmadd.d D11, U3, U4, D11 +- preld 0, A0, A_PRE + 0x40 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 +- xvfmadd.d D14, U2, U4, D14 +- xvfmadd.d D15, U3, U4, D15 ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 ++.endm + +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x20 ++.macro KERNEL8x8x4 ++.rept 4 ++ KERNEL2x8x4 ++.endr ++.endm + +- /***8-8***/ +- /* Load 16 * 64 from A0 */ ++.macro KERNEL8x8x4_END ++.rept 3 ++ KERNEL2x8x4 ++.endr ++ KERNEL2x8x4_END ++.endm ++ ++.macro KERNEL2x4x4 + xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 + +- /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- preld 0, B0, B_PRE ++ xvfmadd.d D0, U8, U12, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- preld 0, A0, A_PRE ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U8, U13, D4 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 +- xvfmadd.d D10, U2, U4, D10 +- xvfmadd.d D11, U3, U4, D11 +- preld 0, A0, A_PRE + 0x40 ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U8, U14, D8 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 +- xvfmadd.d D14, U2, U4, D14 +- xvfmadd.d D15, U3, U4, D15 ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U8, U15, D12 + +- addi.d A0, A0, 0x80 ++ addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + +- addi.d TL, TL, -1 /* TL-- */ +- blt ZERO,TL, .L_TL1 ++ xvld U8, A0, 0x00 + +- /* Maybe we need calculate the last +- * 7 sets of D0~D15? 
+- */ +-.L_L7: +- /* if (!(L & 7)) goto L_L0 */ +- andi TL, L, 7 +- beq TL, ZERO,.L_L0 ++ xvldrepl.d U12, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 + +-.L_L71: +- /* Load 16 * 64 from A0 */ ++ xvldrepl.d U13, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ ++ xvldrepl.d U14, B0, 0x10 ++ xvfmadd.d D8, U0, U6, D8 ++ ++ xvldrepl.d U15, B0, 0x18 ++ xvfmadd.d D12, U0, U7, D12 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 ++.endm ++ ++.macro KERNEL2x4x4_END + xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 + +- /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 ++ xvfmadd.d D0, U8, U12, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U8, U13, D4 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 +- xvfmadd.d D10, U2, U4, D10 +- xvfmadd.d D11, U3, U4, D11 ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U8, U14, D8 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 +- xvfmadd.d D14, U2, U4, D14 +- xvfmadd.d D15, U3, U4, D15 ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U8, U15, D12 + +- /* Add stride for A0, B0 */ +- addi.d A0, A0, 0x80 ++ addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + +- addi.d TL, TL, -1 +- blt ZERO,TL, .L_L71 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D12, U0, U7, D12 ++.endm + +-.L_L0: +-#if defined(TRMMKERNEL) +- xvfmul.d D0, D0, VALPHA +- xvfmul.d D1, D1, VALPHA +- xvfmul.d D2, D2, VALPHA +- xvfmul.d D3, D3, VALPHA +- xvfmul.d D4, D4, VALPHA +- xvfmul.d D5, D5, VALPHA +- xvfmul.d D6, D6, VALPHA +- xvfmul.d D7, D7, VALPHA +- xvfmul.d D8, D8, VALPHA +- xvfmul.d D9, D9, VALPHA +- xvfmul.d D10, D10, VALPHA +- xvfmul.d D11, D11, VALPHA +- xvfmul.d D12, D12, VALPHA +- xvfmul.d D13, D13, VALPHA +- xvfmul.d D14, D14, VALPHA +- xvfmul.d D15, D15, VALPHA +-#else +- /* Load C0 */ +- xvld U0, C0, 0x00 +- xvld U1, C0, 0x20 +- xvld U2, C0, 0x40 +- xvld U3, C0, 0x60 +- xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +- xvfmadd.d D1, D1, VALPHA, U1 +- xvfmadd.d D2, D2, VALPHA, U2 +- xvfmadd.d D3, D3, VALPHA, U3 ++.macro KERNEL8x4x4 ++.rept 4 ++ KERNEL2x4x4 ++.endr ++.endm + +- /* Load C1 */ +- xvld U0, C1, 0x00 +- xvld U1, C1, 0x20 +- xvld U2, C1, 0x40 +- xvld U3, C1, 0x60 +- xvfmadd.d D4, D4, VALPHA, U0 +- xvfmadd.d D5, D5, VALPHA, U1 +- xvfmadd.d D6, D6, VALPHA, U2 +- xvfmadd.d D7, D7, VALPHA, U3 ++.macro KERNEL8x4x4_END ++.rept 3 ++ KERNEL2x4x4 ++.endr ++ KERNEL2x4x4_END ++.endm + +- /* Load C2 */ +- xvld U0, C2, 0x00 +- xvld U1, C2, 0x20 +- xvld U2, C2, 0x40 +- xvld U3, C2, 0x60 +- xvfmadd.d D8, D8, VALPHA, U0 +- xvfmadd.d D9, D9, VALPHA, U1 +- xvfmadd.d D10, D10, VALPHA, U2 +- xvfmadd.d D11, D11, VALPHA, U3 ++.macro KERNEL2x2x4 ++ xvldrepl.d U0, A0, 0x00 ++ xvldrepl.d U1, A0, 0x08 + +- /* Load C3 */ +- xvld U0, C3, 0x00 +- xvld U1, C3, 0x20 +- xvld U2, C3, 0x40 +- xvld U3, C3, 0x60 +- xvfmadd.d D12, D12, VALPHA, U0 +- xvfmadd.d D13, D13, VALPHA, U1 +- xvfmadd.d D14, D14, VALPHA, U2 +- xvfmadd.d D15, D15, VALPHA, U3 +-#endif // #if defined(TRMMKERNEL) ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 + +- /* Store C0 */ +- xvst D0, C0, 0x00 +- xvst D1, C0, 0x20 +- xvst D2, C0, 0x40 +- xvst D3, C0, 0x60 +- /* Store C1 */ +- xvst D4, C1, 0x00 +- xvst D5, C1, 0x20 +- xvst D6, C1, 0x40 
+- xvst D7, C1, 0x60 +- /* Store C2 */ +- xvst D8, C2, 0x00 +- xvst D9, C2, 0x20 +- xvst D10, C2, 0x40 +- xvst D11, C2, 0x60 +- /* Store C3 */ +- xvst D12, C3, 0x00 +- xvst D13, C3, 0x20 +- xvst D14, C3, 0x40 +- xvst D15, C3, 0x60 ++ xvld U4, B0, 0x00 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 + +- /* Add stride for C */ +- addi.d C0, C0, 0x80 +- addi.d C1, C1, 0x80 +- addi.d C2, C2, 0x80 +- addi.d C3, C3, 0x80 ++ xvldrepl.d U8, A0, 0x00 ++ xvldrepl.d U9, A0, 0x08 + +-#if defined(TRMMKERNEL) +-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +- sub.d L, K, OFF +-#ifdef LEFT +- /* number of values in A */ +- addi.d L, L, -16 +-#else +- /* number of values in B */ +- addi.d L, L, -4 +-#endif +- slli.d T0, L, 0x07 +- add.d A0, A0, T0 +- slli.d T0, L, 0x05 +- add.d B0, B0, T0 +-#endif ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 + +-#ifdef LEFT +- addi.d OFF, OFF, 0x10 +-#endif +-#endif // #if defined(TRMMKERNEL) ++ xvld U12, B0, 0x00 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++.endm + +- addi.d I, I, -1 /* I-- */ +- blt ZERO,I, .L_I1 ++.macro KERNEL2x2x4_END ++ xvldrepl.d U0, A0, 0x00 ++ xvldrepl.d U1, A0, 0x08 + +-.L_M8: +- /* We have done M & 16, considering M=8/4/2/1 */ +- andi I, M, 15 +- beq ZERO,I, .L_M0 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 + +- andi I, M, 8 +- beq ZERO,I, .L_M4 ++ xvld U4, B0, 0x00 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 + +-#if defined(TRMMKERNEL) +-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +- move B0, B +-#else +- slli.d T0, OFF, 0x06 +- add.d A0, A0, T0 +- slli.d T0, OFF, 0x05 +- add.d B0, B, T0 +-#endif ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++.endm + +-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +- sub.d L, K, OFF +-#elif defined(LEFT) +- /* number of values in A */ +- addi.d L, OFF, 8 +-#else +- /* number of values in B */ +- addi.d L, OFF, 4 +-#endif +-#else // #if !defined(TRMMKERNEL) +- move B0, B +- move L, K /* L = bk */ +-#endif // #if defined(TRMMKERNEL) ++.macro KERNEL8x2x4 ++.rept 4 ++ KERNEL2x2x4 ++.endr ++.endm + +- /* Load 8 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 ++.macro KERNEL8x2x4_END ++.rept 3 ++ KERNEL2x2x4 ++.endr ++ KERNEL2x2x4_END ++.endm + +- xvldrepl.d U4, B0, 0x00 +- /* line 1 */ +- xvfmul.d D0, U0, U4 +- xvfmul.d D1, U1, U4 ++.macro KERNEL2x1x4 ++ xvldrepl.d U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvld U4, B0, 0x00 + +- xvldrepl.d U4, B0, 0x08 +- /* line 2 */ +- xvfmul.d D4, U0, U4 +- xvfmul.d D5, U1, U4 ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 + +- xvldrepl.d U4, B0, 0x10 +- /* line 3 */ +- xvfmul.d D8, U0, U4 +- xvfmul.d D9, U1, U4 ++ xvldrepl.d U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvld U12, B0, 0x00 + +- xvldrepl.d U4, B0, 0x18 +- /* line 4 */ +- xvfmul.d D12, U0, U4 +- xvfmul.d D13, U1, U4 ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++.endm + +- /* Add stride for A0 and B0 */ +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 +- /* Reduce L */ +- addi.d L, L, -1 +- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ +- /* if (TL < 1) goto L_M8_L7 */ +- beq ZERO,TL, .L_M8_L7 ++.macro KERNEL2x1x4_END ++ xvldrepl.d U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvld U4, B0, 0x00 + +-.L_M8_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 16 * 64 from A0 */ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++ ++ xvfmadd.d D0, U0, U4, D0 ++.endm ++ ++.macro KERNEL8x1x4 ++.rept 4 ++ KERNEL2x1x4 ++.endr ++.endm ++ ++.macro KERNEL8x1x4_END ++.rept 3 ++ KERNEL2x1x4 ++.endr ++ 
KERNEL2x1x4_END ++.endm ++ ++.macro KERNEL2x16x2 + xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ + xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ ++ xvld U2, A0, 0x40 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvld U3, A0, 0x60 ++ xvfmadd.d D6, U10, U13, D6 ++ xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ ++ xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ xvld U9, A0, 0x20 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 ++ xvld U10, A0, 0x40 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 ++ xvld U11, A0, 0x60 ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++.endm + +- /***8-2***/ ++.macro KERNEL2x16x2_END + xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ + xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ ++ xvld U2, A0, 0x40 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvld U3, A0, 0x60 ++ xvfmadd.d D6, U10, U13, D6 ++ xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 ++.endm + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 ++.macro KERNEL8x16x2 ++.rept 4 ++ KERNEL2x16x2 ++.endr ++.endm ++ ++.macro KERNEL8x16x2_END ++.rept 3 ++ KERNEL2x16x2 ++.endr ++ KERNEL2x16x2_END ++.endm + +- /***8-3***/ ++.macro KERNEL2x8x2 + xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ + xvld U1, A0, 0x20 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++ ++ xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 ++ xvld U9, A0, 0x20 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 ++ addi.d B0, B0, 0x10 ++.endm + +- /***8-4***/ ++.macro KERNEL2x8x2_END + xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ + xvld U1, A0, 0x20 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++ + xvfmadd.d D0, 
U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++.endm + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 ++.macro KERNEL8x8x2 ++.rept 4 ++ KERNEL2x8x2 ++.endr ++.endm + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 ++.macro KERNEL8x8x2_END ++.rept 3 ++ KERNEL2x8x2 ++ .endr ++ KERNEL2x8x2_END ++.endm + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 +- +- /***8-5***/ ++.macro KERNEL2x4x2 + xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 ++ xvldrepl.d U5, B0, 0x08 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++.endm + +- /***8-6***/ ++.macro KERNEL2x4x2_END + xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 ++ xvldrepl.d U5, B0, 0x08 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++.endm + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 ++.macro KERNEL8x4x2 ++.rept 4 ++ KERNEL2x4x2 ++.endr ++.endm + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 ++.macro KERNEL8x4x2_END ++.rept 3 ++ KERNEL2x4x2 ++.endr ++ KERNEL2x4x2_END ++.endm + +- /***8-7***/ ++.macro KERNEL2x2x2 + xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++ ++ xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D4, U0, U5, D4 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++.endm + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 ++.macro KERNEL2x2x2_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++.endm + +- /***8-8***/ ++.macro KERNEL8x2x2 ++.rept 4 ++ KERNEL2x2x2 ++.endr ++.endm ++ ++.macro KERNEL8x2x2_END ++.rept 3 ++ KERNEL2x2x2 ++.endr ++ KERNEL2x2x2_END ++.endm ++ ++.macro KERNEL2x1x2 + xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ 
addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ ++ xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D4, U0, U5, D4 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++.endm + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 ++.macro KERNEL2x1x2_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 + +- addi.d TL, TL, -1 /* TL-- */ +- blt ZERO,TL, .L_M8_TL1 ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 + +-.L_M8_L7: +- /* if (!(L & 7)) goto L_M8_L0 */ +- andi TL, L, 7 +- beq TL, ZERO,.L_M8_L0 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++.endm + +-.L_M8_L71: ++.macro KERNEL8x1x2 ++.rept 4 ++ KERNEL2x1x2 ++.endr ++.endm ++ ++.macro KERNEL8x1x2_END ++.rept 3 ++ KERNEL2x1x2 ++.endr ++ KERNEL2x1x2_END ++.endm ++ ++.macro KERNEL2x16x1 + xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ + xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++ ++ xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ xvld U9, A0, 0x20 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 ++ xvld U10, A0, 0x40 ++ xvld U11, A0, 0x60 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 ++ xvldrepl.d U12, B0, 0x00 + +- /* Add stride for A0, B0 */ +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++.endm + +- addi.d TL, TL, -1 +- blt ZERO,TL, .L_M8_L71 ++.macro KERNEL2x16x1_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 + +-.L_M8_L0: +-#if defined(TRMMKERNEL) +- xvfmul.d D0, D0, VALPHA +- xvfmul.d D1, D1, VALPHA +- xvfmul.d D4, D4, VALPHA +- xvfmul.d D5, D5, VALPHA +- xvfmul.d D8, D8, VALPHA +- xvfmul.d D9, D9, VALPHA +- xvfmul.d D12, D12, VALPHA +- xvfmul.d D13, D13, VALPHA +-#else +- /* Load C0 */ +- xvld U0, C0, 0x00 +- xvld U1, C0, 0x20 +- xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +- xvfmadd.d D1, D1, VALPHA, U1 ++ xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 + +- /* Load C1 */ +- xvld U0, C1, 0x00 +- xvld U1, C1, 0x20 +- xvfmadd.d D4, D4, VALPHA, U0 +- xvfmadd.d D5, D5, VALPHA, U1 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 + +- /* Load C2 */ +- xvld U0, C2, 0x00 +- xvld U1, C2, 0x20 +- xvfmadd.d D8, D8, VALPHA, U0 +- xvfmadd.d D9, D9, VALPHA, U1 ++ xvldrepl.d U4, B0, 0x00 + +- /* Load C3 */ +- xvld U0, C3, 0x00 +- xvld U1, C3, 0x20 +- xvfmadd.d D12, D12, VALPHA, U0 +- xvfmadd.d D13, D13, VALPHA, U1 +-#endif // #if defined(TRMMKERNEL) ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 + +- /* Store C0 */ +- xvst D0, C0, 0x00 +- xvst D1, C0, 0x20 +- /* Store C1 */ +- xvst D4, C1, 0x00 +- xvst D5, C1, 0x20 +- /* Store C2 */ +- xvst D8, C2, 0x00 +- xvst D9, C2, 0x20 +- /* Store C3 */ +- xvst D12, C3, 0x00 +- xvst D13, C3, 0x20 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, 
U1, U4, D1 + +- /* Add stride for C */ +- addi.d C0, C0, 0x40 +- addi.d C1, C1, 0x40 +- addi.d C2, C2, 0x40 +- addi.d C3, C3, 0x40 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++.endm + +-#if defined(TRMMKERNEL) +-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +- sub.d L, K, OFF +-#ifdef LEFT +- /* number of values in A */ +- addi.d L, L, -8 +-#else +- /* number of values in B */ +- addi.d L, L, -4 +-#endif +- slli.d T0, L, 0x06 +- add.d A0, A0, T0 +- slli.d T0, L, 0x05 +- add.d B0, B0, T0 +-#endif ++.macro KERNEL8x16x1 ++.rept 4 ++ KERNEL2x16x1 ++.endr ++.endm + +-#ifdef LEFT +- /* number of values in A */ +- addi.d OFF, OFF, 0x08 +-#endif +-#endif // #if defined(TRMMKERNEL) ++.macro KERNEL8x16x1_END ++.rept 3 ++ KERNEL2x16x1 ++.endr ++ KERNEL2x16x1_END ++.endm + +-/********LOOP (if(N >> 2 ) && (M & 8)) End************/ ++.macro KERNEL2x8x1 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ xvld U1, A0, 0x20 ++ xvldrepl.d U4, B0, 0x00 + +-.L_M4: +- andi I, M, 4 +- beq ZERO,I, .L_M2 ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 + +-#if defined(TRMMKERNEL) +-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +- move B0, B +-#else +- slli.d T0, OFF, 0x05 +- add.d A0, A0, T0 +- add.d B0, B, T0 +-#endif ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvld U9, A0, 0x20 ++ xvldrepl.d U12, B0, 0x00 + +-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +- sub.d L, K, OFF +-#elif defined(LEFT) +- /* number of values in A */ +- addi.d L, OFF, 4 +-#else +- /* number of values in B */ +- addi.d L, OFF, 4 +-#endif +-#else // #if !defined(TRMMKERNEL) +- move B0, B +- move L, K /* L = bk */ +-#endif ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++.endm + +- /* Load 4 * 64 from A0 */ ++.macro KERNEL2x8x1_END + xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ xvld U1, A0, 0x20 ++ xvldrepl.d U4, B0, 0x00 + +- xvldrepl.d U4, B0, 0x00 +- /* line 1 */ +- xvfmul.d D0, U0, U4 +- +- xvldrepl.d U4, B0, 0x08 +- /* line 2 */ +- xvfmul.d D4, U0, U4 ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 + +- xvldrepl.d U4, B0, 0x10 +- /* line 3 */ +- xvfmul.d D8, U0, U4 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++.endm + +- xvldrepl.d U4, B0, 0x18 +- /* line 4 */ +- xvfmul.d D12, U0, U4 ++.macro KERNEL8x8x1 ++.rept 4 ++ KERNEL2x8x1 ++.endr ++.endm + +- /* Add stride for A0 and B0 */ +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x20 +- /* Reduce L */ +- addi.d L, L, -1 +- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ +- /* if (TL < 1) goto L_M4_L7 */ +- beq ZERO,TL, .L_M4_L7 ++.macro KERNEL8x8x1_END ++.rept 3 ++ KERNEL2x8x1 ++.endr ++ KERNEL2x8x1_END ++.endm + +-.L_M4_TL1: /* TL-- */ +- /***8-1***/ ++.macro KERNEL2x4x1 + xvld U0, A0, 0x00 +- ++ xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x20 ++ addi.d B0, B0, 0x08 ++.endm + +- /***8-2***/ ++.macro KERNEL2x4x1_END + xvld U0, A0, 0x00 +- ++ xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 
+ +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D0, U0, U4, D0 ++.endm + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++.macro KERNEL8x4x1 ++.rept 4 ++ KERNEL2x4x1 ++.endr ++.endm + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x20 ++.macro KERNEL8x4x1_END ++.rept 3 ++ KERNEL2x4x1 ++.endr ++ KERNEL2x4x1_END ++.endm + +- /***8-3***/ ++.macro KERNEL2x2x1 + xvld U0, A0, 0x00 +- ++ xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvldrepl.d U12, B0, 0x00 + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x20 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 ++.endm + +- /***8-4***/ ++.macro KERNEL2x2x1_END + xvld U0, A0, 0x00 +- ++ xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D0, U0, U4, D0 ++.endm + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++.macro KERNEL8x2x1 ++.rept 4 ++ KERNEL2x2x1 ++.endr ++.endm + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x20 ++.macro KERNEL8x2x1_END ++.rept 3 ++ KERNEL2x2x1 ++.endr ++ KERNEL2x2x1_END ++.endm + +- /***8-5***/ ++.macro KERNEL2x1x1 + xvld U0, A0, 0x00 +- ++ xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvldrepl.d U12, B0, 0x00 + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x20 ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++.endm + +- /***8-6***/ ++.macro KERNEL2x1x1_END + xvld U0, A0, 0x00 +- ++ xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ + xvfmadd.d D0, U0, U4, D0 ++.endm + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++.macro KERNEL8x1x1 ++.rept 4 ++ KERNEL2x1x1 ++.endr ++.endm + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++.macro KERNEL8x1x1_END ++.rept 3 ++ KERNEL2x1x1 ++.endr ++ KERNEL2x1x1_END ++.endm + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x20 ++ PROLOGUE + +- /***8-7***/ +- xvld U0, A0, 0x00 ++ addi.d $sp, $sp, -120 ++ /* Store regs */ ++ SDARG $r23, $sp, 0 ++ SDARG $r24, $sp, 8 ++ SDARG $r25, $sp, 16 ++ SDARG $r26, $sp, 24 ++ SDARG $r27, $sp, 32 ++ ST $f23, $sp, 40 ++ ST $f24, $sp, 48 ++ ST $f25, $sp, 56 ++ ST $f26, $sp, 64 ++ ST $f27, $sp, 72 ++ ST $f28, $sp, 80 ++ ST $f29, $sp, 88 ++ ST $f30, $sp, 96 ++ ST $f31, $sp, 104 ++ ST ALPHA, $sp, 112 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++#if defined (TRMMKERNEL) && !defined(LEFT) ++ sub.d OFF, ZERO, OFFSET ++#else ++ xor OFF, OFF, OFF ++#endif + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ /* if (!(N >> 2)) goto L_N3 */ ++ srai.d J, N, 2 /* J = bn >> 2 */ ++ andi N, N, 0x03 ++ xvldrepl.d VALPHA, $sp, 112 /* When N < 4, VALPHA will not changed */ ++ beq ZERO, J, .L_N3 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++.L_J1: /* J-- && This loop include Condition 1 */ + +- xvldrepl.d U4, 
B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! ************************* ++* dgemm_core_16x4 */ ++ move C0, C ++ move A0, A ++ slli.d T0, LDC, 3 ++ add.d C1, C0, T0 ++ addi.d J, J, -1 /* J-- */ ++ add.d C2, C1, T0 ++ add.d C3, C2, T0 + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x20 ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move OFF, OFFSET ++#endif ++ ++ /* if (!(M >> 4)) goto L_M8 */ ++ srai.d I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_M8 + +- /***8-8***/ ++.L_I1: /* I-- */ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x07 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x05 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 16 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ /* Calculate the first set of D0~D15, ++ * avoidig set 0 operation ++ * Load 16 * 64 from A0 ++ * U0 = {a3, a2, a1, a0} ++ * U1 = {a7, a6, a5, a4} ++ * U2 = {a11, a10, a9, a8} ++ * U3 = {a15, a14, a13, a12} ++ */ + xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ xvldrepl.d U4, B0, 0x00 ++ preld 0, C0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ preld 0, C0, 0x40 ++ xvfmul.d D2, U2, U4 ++ xvfmul.d D3, U3, U4 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvldrepl.d U5, B0, 0x08 ++ preld 0, C1, 0x00 ++ /* line 2 */ ++ xvfmul.d D4, U0, U5 ++ xvfmul.d D5, U1, U5 ++ preld 0, C1, 0x40 ++ xvfmul.d D6, U2, U5 ++ xvfmul.d D7, U3, U5 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ xvldrepl.d U6, B0, 0x10 ++ preld 0, C2, 0x00 ++ /* line 3 */ ++ xvfmul.d D8, U0, U6 ++ xvfmul.d D9, U1, U6 ++ preld 0, C2, 0x40 ++ xvfmul.d D10, U2, U6 ++ xvfmul.d D11, U3, U6 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ xvldrepl.d U7, B0, 0x18 ++ preld 0, C3, 0x00 ++ /* line 4 */ ++ xvfmul.d D12, U0, U7 ++ xvfmul.d D13, U1, U7 ++ preld 0, C3, 0x40 ++ xvfmul.d D14, U2, U7 ++ xvfmul.d D15, U3, U7 + +- addi.d A0, A0, 0x20 ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x20 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_L7 */ ++ beq ZERO,TL, .L_L7 ++ ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ xvld U10, A0, 0x40 ++ xvld U11, A0, 0x60 ++ ++ addi.d TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ xvldrepl.d U14, B0, 0x10 ++ xvldrepl.d U15, B0, 0x18 ++ addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + ++ beq ZERO, TL, .L_TL1_END ++.L_TL1: /* TL-- */ ++ KERNEL8x16x4 + addi.d TL, TL, -1 /* TL-- */ +- blt ZERO,TL, .L_M4_TL1 ++ blt ZERO,TL, .L_TL1 + +-.L_M4_L7: +- /* if (!(L & 7)) goto L_M4_L0 */ ++.L_TL1_END: ++ KERNEL8x16x4_END ++ ++ /* Maybe we need calculate the last ++ * 7 sets of D0~D15? 
++ */ ++.L_L7: ++ /* if (!(L & 7)) goto L_L0 */ + andi TL, L, 7 +- beq TL, ZERO,.L_M4_L0 ++ beq TL, ZERO,.L_L0 + +-.L_M4_L71: ++.L_L71: ++ /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 + ++ /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 ++ xvfmadd.d D10, U2, U6, D10 ++ xvfmadd.d D11, U3, U6, D11 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 ++ xvfmadd.d D14, U2, U7, D14 ++ xvfmadd.d D15, U3, U7, D15 + + /* Add stride for A0, B0 */ +- addi.d A0, A0, 0x20 ++ addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 +- blt ZERO,TL, .L_M4_L71 ++ blt ZERO,TL, .L_L71 + +-.L_M4_L0: ++.L_L0: ++ xvldrepl.d VALPHA, $sp, 112 + #if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA ++ xvfmul.d D1, D1, VALPHA ++ xvfmul.d D2, D2, VALPHA ++ xvfmul.d D3, D3, VALPHA + xvfmul.d D4, D4, VALPHA ++ xvfmul.d D5, D5, VALPHA ++ xvfmul.d D6, D6, VALPHA ++ xvfmul.d D7, D7, VALPHA + xvfmul.d D8, D8, VALPHA ++ xvfmul.d D9, D9, VALPHA ++ xvfmul.d D10, D10, VALPHA ++ xvfmul.d D11, D11, VALPHA + xvfmul.d D12, D12, VALPHA ++ xvfmul.d D13, D13, VALPHA ++ xvfmul.d D14, D14, VALPHA ++ xvfmul.d D15, D15, VALPHA + #else + /* Load C0 */ + xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++ xvfmadd.d D1, D1, VALPHA, U1 ++ xvfmadd.d D2, D2, VALPHA, U2 ++ xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ +- xvld U0, C1, 0x00 +- xvfmadd.d D4, D4, VALPHA, U0 ++ xvld U4, C1, 0x00 ++ xvld U5, C1, 0x20 ++ xvld U6, C1, 0x40 ++ xvld U7, C1, 0x60 ++ xvfmadd.d D4, D4, VALPHA, U4 ++ xvfmadd.d D5, D5, VALPHA, U5 ++ xvfmadd.d D6, D6, VALPHA, U6 ++ xvfmadd.d D7, D7, VALPHA, U7 + + /* Load C2 */ +- xvld U0, C2, 0x00 +- xvfmadd.d D8, D8, VALPHA, U0 ++ xvld U8, C2, 0x00 ++ xvld U9, C2, 0x20 ++ xvld U10, C2, 0x40 ++ xvld U11, C2, 0x60 ++ xvfmadd.d D8, D8, VALPHA, U8 ++ xvfmadd.d D9, D9, VALPHA, U9 ++ xvfmadd.d D10, D10, VALPHA, U10 ++ xvfmadd.d D11, D11, VALPHA, U11 + + /* Load C3 */ + xvld U0, C3, 0x00 ++ xvld U1, C3, 0x20 ++ xvld U2, C3, 0x40 ++ xvld U3, C3, 0x60 + xvfmadd.d D12, D12, VALPHA, U0 +-#endif // #if defined(TRMMKERNEL) ++ xvfmadd.d D13, D13, VALPHA, U1 ++ xvfmadd.d D14, D14, VALPHA, U2 ++ xvfmadd.d D15, D15, VALPHA, U3 ++#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 ++ xvst D1, C0, 0x20 ++ xvst D2, C0, 0x40 ++ xvst D3, C0, 0x60 + /* Store C1 */ + xvst D4, C1, 0x00 ++ xvst D5, C1, 0x20 ++ xvst D6, C1, 0x40 ++ xvst D7, C1, 0x60 + /* Store C2 */ + xvst D8, C2, 0x00 ++ xvst D9, C2, 0x20 ++ xvst D10, C2, 0x40 ++ xvst D11, C2, 0x60 + /* Store C3 */ + xvst D12, C3, 0x00 ++ xvst D13, C3, 0x20 ++ xvst D14, C3, 0x40 ++ xvst D15, C3, 0x60 + + /* Add stride for C */ +- addi.d C0, C0, 0x20 +- addi.d C1, C1, 0x20 +- addi.d C2, C2, 0x20 +- addi.d C3, C3, 0x20 ++ addi.d C0, C0, 0x80 ++ addi.d C1, C1, 0x80 ++ addi.d C2, C2, 0x80 ++ addi.d C3, C3, 0x80 + + #if defined(TRMMKERNEL) + #if (defined(LEFT) && defined(TRANSA)) || 
(!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF + #ifdef LEFT +- /* number of values in A */ +- addi.d L, L, -4 ++ /* number of values in A */ ++ addi.d L, L, -16 + #else + /* number of values in B */ + addi.d L, L, -4 + #endif +- slli.d T0, L, 0x05 ++ slli.d T0, L, 0x07 + add.d A0, A0, T0 ++ slli.d T0, L, 0x05 + add.d B0, B0, T0 + #endif + + #ifdef LEFT +- /* number of values in A */ +- addi.d OFF, OFF, 0x04 ++ addi.d OFF, OFF, 0x10 + #endif + #endif // #if defined(TRMMKERNEL) + +-/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ ++ addi.d I, I, -1 /* I-- */ ++ blt ZERO,I, .L_I1 + +-.L_M2: +- andi I, M, 2 +- beq ZERO,I, .L_M1 ++.L_M8: ++ /* We have done M & 16, considering M=8/4/2/1 */ ++ andi I, M, 15 ++ beq ZERO,I, .L_M0 ++ ++ andi I, M, 8 ++ beq ZERO,I, .L_M4 + + #if defined(TRMMKERNEL) + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B + #else +- slli.d T0, OFF, 0x04 ++ slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +@@ -1361,7 +1427,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + sub.d L, K, OFF + #elif defined(LEFT) + /* number of values in A */ +- addi.d L, OFF, 2 ++ addi.d L, OFF, 8 + #else + /* number of values in B */ + addi.d L, OFF, 4 +@@ -1369,262 +1435,163 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +-#endif ++#endif // #if defined(TRMMKERNEL) + +- /* Load 2 * 64 from A0 */ ++ /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 + +- xvldrepl.d U4, B0, 0x08 ++ xvldrepl.d U5, B0, 0x08 + /* line 2 */ +- xvfmul.d D4, U0, U4 ++ xvfmul.d D4, U0, U5 ++ xvfmul.d D5, U1, U5 + +- xvldrepl.d U4, B0, 0x10 ++ xvldrepl.d U6, B0, 0x10 + /* line 3 */ +- xvfmul.d D8, U0, U4 ++ xvfmul.d D8, U0, U6 ++ xvfmul.d D9, U1, U6 + +- xvldrepl.d U4, B0, 0x18 ++ xvldrepl.d U7, B0, 0x18 + /* line 4 */ +- xvfmul.d D12, U0, U4 ++ xvfmul.d D12, U0, U7 ++ xvfmul.d D13, U1, U7 + + /* Add stride for A0 and B0 */ +- addi.d A0, A0, 0x10 ++ addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ +- /* if (TL < 1) goto L_M2_L7 */ +- beq ZERO,TL, .L_M2_L7 +- +-.L_M2_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 2 * 64 from A0 */ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x20 +- +- /***8-2***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x20 +- +- /***8-3***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x20 +- +- /***8-4***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, 
B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x20 +- +- /***8-5***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x20 +- +- /***8-6***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x20 +- +- /***8-7***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ /* if (TL < 1) goto L_M8_L7 */ ++ beq ZERO,TL, .L_M8_L7 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ addi.d TL, TL, -1 + +- addi.d A0, A0, 0x10 ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ xvldrepl.d U14, B0, 0x10 ++ xvldrepl.d U15, B0, 0x18 ++ addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + +- /***8-8***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ beq ZERO, TL, .L_M8_TL1_END + +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x20 ++.L_M8_TL1: /* TL-- */ ++ KERNEL8x8x4 + + addi.d TL, TL, -1 /* TL-- */ +- blt ZERO,TL, .L_M2_TL1 ++ blt ZERO,TL, .L_M8_TL1 + +-.L_M2_L7: +- /* if (!(L & 7)) goto L_M2_L0 */ ++.L_M8_TL1_END: ++ KERNEL8x8x4_END ++ ++.L_M8_L7: ++ /* if (!(L & 7)) goto L_M8_L0 */ + andi TL, L, 7 +- beq TL, ZERO,.L_M2_L0 ++ beq TL, ZERO,.L_M8_L0 + +-.L_M2_L71: ++.L_M8_L71: + xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 + + /* Add stride for A0, B0 */ +- addi.d A0, A0, 0x10 ++ addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 +- blt ZERO,TL, .L_M2_L71 ++ blt ZERO,TL, .L_M8_L71 + +-.L_M2_L0: ++.L_M8_L0: ++ xvldrepl.d VALPHA, $sp, 112 + #if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA ++ xvfmul.d D1, D1, VALPHA + xvfmul.d D4, D4, VALPHA ++ xvfmul.d D5, D5, VALPHA + xvfmul.d D8, D8, VALPHA ++ xvfmul.d D9, D9, VALPHA + xvfmul.d D12, D12, VALPHA ++ xvfmul.d D13, D13, VALPHA + #else + /* Load C0 */ + xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++ xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ +- xvld U0, C1, 0x00 +- xvfmadd.d D4, D4, VALPHA, U0 ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++ xvfmadd.d D4, D4, VALPHA, U2 ++ xvfmadd.d D5, D5, VALPHA, U3 + + /* Load C2 */ +- xvld U0, C2, 0x00 +- xvfmadd.d D8, D8, VALPHA, U0 ++ xvld U4, C2, 0x00 ++ xvld U5, C2, 0x20 ++ xvfmadd.d 
D8, D8, VALPHA, U4 ++ xvfmadd.d D9, D9, VALPHA, U5 + + /* Load C3 */ +- xvld U0, C3, 0x00 +- xvfmadd.d D12, D12, VALPHA, U0 ++ xvld U6, C3, 0x00 ++ xvld U7, C3, 0x20 ++ xvfmadd.d D12, D12, VALPHA, U6 ++ xvfmadd.d D13, D13, VALPHA, U7 + #endif // #if defined(TRMMKERNEL) + +- xvstelm.d D0, C0, 0x00, 0x00 +- xvstelm.d D4, C1, 0x00, 0x00 +- xvstelm.d D8, C2, 0x00, 0x00 +- xvstelm.d D12, C3, 0x00, 0x00 +- xvstelm.d D0, C0, 0x08, 0x01 +- xvstelm.d D4, C1, 0x08, 0x01 +- xvstelm.d D8, C2, 0x08, 0x01 +- xvstelm.d D12, C3, 0x08, 0x01 ++ /* Store C0 */ ++ xvst D0, C0, 0x00 ++ xvst D1, C0, 0x20 ++ /* Store C1 */ ++ xvst D4, C1, 0x00 ++ xvst D5, C1, 0x20 ++ /* Store C2 */ ++ xvst D8, C2, 0x00 ++ xvst D9, C2, 0x20 ++ /* Store C3 */ ++ xvst D12, C3, 0x00 ++ xvst D13, C3, 0x20 + + /* Add stride for C */ +- addi.d C0, C0, 0x10 +- addi.d C1, C1, 0x10 +- addi.d C2, C2, 0x10 +- addi.d C3, C3, 0x10 ++ addi.d C0, C0, 0x40 ++ addi.d C1, C1, 0x40 ++ addi.d C2, C2, 0x40 ++ addi.d C3, C3, 0x40 + + #if defined(TRMMKERNEL) + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF + #ifdef LEFT + /* number of values in A */ +- addi.d L, L, -2 ++ addi.d L, L, -8 + #else + /* number of values in B */ + addi.d L, L, -4 + #endif +- slli.d T0, L, 0x04 ++ slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +@@ -1632,23 +1599,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + #ifdef LEFT + /* number of values in A */ +- addi.d OFF, OFF, 0x02 ++ addi.d OFF, OFF, 0x08 + #endif + #endif // #if defined(TRMMKERNEL) + +-/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ ++/********LOOP (if(N >> 2 ) && (M & 8)) End************/ + +-.L_M1: +- andi I, M, 1 +- beq ZERO,I, .L_M0 ++.L_M4: ++ andi I, M, 4 ++ beq ZERO,I, .L_M2 + + #if defined(TRMMKERNEL) + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B + #else +- slli.d T0, OFF, 0x03 +- add.d A0, A0, T0 + slli.d T0, OFF, 0x05 ++ add.d A0, A0, T0 + add.d B0, B, T0 + #endif + +@@ -1656,7 +1622,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + sub.d L, K, OFF + #elif defined(LEFT) + /* number of values in A */ +- addi.d L, OFF, 1 ++ addi.d L, OFF, 4 + #else + /* number of values in B */ + addi.d L, OFF, 4 +@@ -1666,55 +1632,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ move L, K /* L = bk */ + #endif + +- /* Load 1 * 64 from A0 */ ++ /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + +- xvldrepl.d U4, B0, 0x08 ++ xvldrepl.d U5, B0, 0x08 + /* line 2 */ +- xvfmul.d D4, U0, U4 ++ xvfmul.d D4, U0, U5 + +- xvldrepl.d U4, B0, 0x10 ++ xvldrepl.d U6, B0, 0x10 + /* line 3 */ +- xvfmul.d D8, U0, U4 ++ xvfmul.d D8, U0, U6 + +- xvldrepl.d U4, B0, 0x18 ++ xvldrepl.d U7, B0, 0x18 + /* line 4 */ +- xvfmul.d D12, U0, U4 ++ xvfmul.d D12, U0, U7 + + /* Add stride for A0 and B0 */ +- addi.d A0, A0, 0x08 ++ addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ +- /* if (TL < 1) goto L_M1_L7 */ +- beq ZERO,TL, .L_M1_L7 ++ /* if (TL < 1) goto L_M4_L7 */ ++ beq ZERO,TL, .L_M4_L7 + +-.L_M1_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 1 * 64 from A0 */ +- xvld U0, A0, 0x00 ++ xvld U8, A0, 0x00 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ addi.d TL, TL, -1 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ xvldrepl.d U14, B0, 0x10 ++ xvldrepl.d U15, B0, 0x18 ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ beq ZERO, TL, .L_M4_TL1_END + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++.L_M4_TL1: /* TL-- */ ++ KERNEL8x4x4 + +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x20 ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_M4_TL1 ++ ++.L_M4_TL1_END: ++ KERNEL8x4x4_END ++ ++.L_M4_L7: ++ /* if (!(L & 7)) goto L_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_M4_L0 + +- /***8-2***/ ++.L_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 +@@ -1729,119 +1702,287 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + +- addi.d A0, A0, 0x08 ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + +- /***8-3***/ +- xvld U0, A0, 0x00 ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_M4_L71 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++.L_M4_L0: ++ xvldrepl.d VALPHA, $sp, 112 ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++ xvfmul.d D4, D4, VALPHA ++ xvfmul.d D8, D8, VALPHA ++ xvfmul.d D12, D12, VALPHA ++#else ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ xvfmadd.d D4, D4, VALPHA, U1 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ xvfmadd.d D8, D8, VALPHA, U2 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++ xvfmadd.d D12, D12, VALPHA, U3 ++#endif // #if defined(TRMMKERNEL) + +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x20 ++ /* Store C0 */ ++ xvst D0, C0, 0x00 ++ /* Store C1 */ ++ xvst D4, C1, 0x00 ++ /* Store C2 */ ++ xvst D8, C2, 0x00 ++ /* Store C3 */ ++ xvst D12, C3, 0x00 + +- /***8-4***/ +- xvld U0, A0, 0x00 ++ /* Add stride for C */ ++ addi.d C0, C0, 0x20 ++ addi.d C1, C1, 0x20 ++ addi.d C2, C2, 0x20 ++ addi.d C3, C3, 0x20 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ addi.d L, L, -4 ++#else ++ /* number of values in B */ ++ addi.d L, L, -4 ++#endif ++ slli.d T0, L, 0x05 ++ add.d A0, A0, T0 ++ add.d B0, B0, T0 ++#endif + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++#ifdef LEFT ++ /* number of values in A */ ++ addi.d OFF, OFF, 0x04 ++#endif ++#endif // #if defined(TRMMKERNEL) + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++.L_M2: ++ andi I, M, 2 ++ beq ZERO,I, .L_M1 + +- addi.d A0, A0, 0x08 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x04 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x05 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 2 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ ++ /* Load 2 * 64 from A0 */ ++ xvldrepl.d U0, A0, 0x00 ++ xvldrepl.d U1, A0, 0x08 ++ ++ xvld U4, B0, 0x00 ++ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M2_L7 */ ++ beq ZERO,TL, .L_M2_L7 ++ ++ xvldrepl.d U8, A0, 0x00 ++ xvldrepl.d U9, A0, 0x08 ++ ++ addi.d TL, TL, -1 ++ ++ xvld U12, B0, 0x00 ++ addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + +- /***8-5***/ +- xvld U0, A0, 0x00 ++ beq ZERO, TL, .L_M2_TL1_END ++.L_M2_TL1: /* TL-- */ ++ KERNEL8x2x4 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_M2_TL1 ++.L_M2_TL1_END: ++ KERNEL8x2x4_END + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d 
D4, U0, U4, D4 ++.L_M2_L7: ++ /* if (!(L & 7)) goto L_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_M2_L0 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++.L_M2_L71: ++ xvldrepl.d U0, A0, 0x00 ++ xvldrepl.d U1, A0, 0x08 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ xvld U4, B0, 0x00 + +- addi.d A0, A0, 0x08 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + +- /***8-6***/ +- xvld U0, A0, 0x00 ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_M2_L71 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++.L_M2_L0: ++ xvldrepl.d VALPHA, $sp, 112 ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++ xvfmul.d D1, D1, VALPHA + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvstelm.d D0, C0, 0x00, 0x00 ++ xvstelm.d D0, C1, 0x00, 0x01 ++ xvstelm.d D0, C2, 0x00, 0x02 ++ xvstelm.d D0, C3, 0x00, 0x03 ++ xvstelm.d D1, C0, 0x08, 0x00 ++ xvstelm.d D1, C1, 0x08, 0x01 ++ xvstelm.d D1, C2, 0x08, 0x02 ++ xvstelm.d D1, C3, 0x08, 0x03 ++#else ++ xvpackev.d D4, D1, D0 ++ xvpackod.d D5, D1, D0 ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ xvpermi.q U2, U0, 0x20 ++ xvpermi.q U3, U1, 0x20 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D0, D4, VALPHA, U2 ++ xvfmadd.d D1, D5, VALPHA, U3 + +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x20 ++ vst $vr16, C0, 0x00 ++ vst $vr17, C1, 0x00 ++ xvstelm.d D0, C2, 0x00, 0x02 ++ xvstelm.d D1, C3, 0x00, 0x02 ++ xvstelm.d D0, C2, 0x08, 0x03 ++ xvstelm.d D1, C3, 0x08, 0x03 ++#endif // #if defined(TRMMKERNEL) + +- /***8-7***/ +- xvld U0, A0, 0x00 ++ /* Add stride for C */ ++ addi.d C0, C0, 0x10 ++ addi.d C1, C1, 0x10 ++ addi.d C2, C2, 0x10 ++ addi.d C3, C3, 0x10 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ addi.d L, L, -2 ++#else ++ /* number of values in B */ ++ addi.d L, L, -4 ++#endif ++ slli.d T0, L, 0x04 ++ add.d A0, A0, T0 ++ slli.d T0, L, 0x05 ++ add.d B0, B0, T0 ++#endif + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++#ifdef LEFT ++ /* number of values in A */ ++ addi.d OFF, OFF, 0x02 ++#endif ++#endif // #if defined(TRMMKERNEL) + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++.L_M1: ++ andi I, M, 1 ++ beq ZERO,I, .L_M0 + +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x20 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x03 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x05 ++ add.d B0, B, T0 ++#endif + +- /***8-8***/ +- xvld U0, A0, 0x00 ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 1 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ xvldrepl.d U0, A0, 0x00 ++ xvld U4, B0, 0x00 ++ xvfmul.d D0, U0, U4 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ /* Add stride for A0 and B0 
*/ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M1_L7 */ ++ beq ZERO,TL, .L_M1_L7 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ xvldrepl.d U8, A0, 0x00 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ addi.d TL, TL, -1 ++ xvld U12, B0, 0x00 ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 + +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x20 ++ beq ZERO, TL, .L_M1_TL1_END ++ ++.L_M1_TL1: /* TL-- */ ++ KERNEL8x1x4 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 ++.L_M1_TL1_END: ++ KERNEL8x1x4_END + + .L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ +@@ -1849,19 +1990,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + beq TL, ZERO,.L_M1_L0 + + .L_M1_L71: +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ xvldrepl.d U0, A0, 0x00 ++ xvld U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 +@@ -1871,33 +2002,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + blt ZERO,TL, .L_M1_L71 + + .L_M1_L0: ++ xvldrepl.d VALPHA, $sp, 112 + #if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +- xvfmul.d D4, D4, VALPHA +- xvfmul.d D8, D8, VALPHA +- xvfmul.d D12, D12, VALPHA ++ ++ xvstelm.d D0, C0, 0x00, 0x00 ++ xvstelm.d D0, C1, 0x00, 0x01 ++ xvstelm.d D0, C2, 0x00, 0x02 ++ xvstelm.d D0, C3, 0x00, 0x03 + #else + /* Load C0 */ +- xvld U0, C0, 0x00 +- xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++ xvldrepl.d U0, C0, 0x00 ++ xvfmadd.d D4, D0, VALPHA, U0 + + /* Load C1 */ +- xvld U0, C1, 0x00 +- xvfmadd.d D4, D4, VALPHA, U0 ++ xvldrepl.d U1, C1, 0x00 ++ xvfmadd.d D5, D0, VALPHA, U1 + + /* Load C2 */ +- xvld U0, C2, 0x00 +- xvfmadd.d D8, D8, VALPHA, U0 ++ xvldrepl.d U2, C2, 0x00 ++ xvfmadd.d D6, D0, VALPHA, U2 + + /* Load C3 */ +- xvld U0, C3, 0x00 +- xvfmadd.d D12, D12, VALPHA, U0 +-#endif // #if defined(TRMMKERNEL) ++ xvldrepl.d U3, C3, 0x00 ++ xvfmadd.d D7, D0, VALPHA, U3 + +- xvstelm.d D0, C0, 0x00, 0x00 +- xvstelm.d D4, C1, 0x00, 0x00 +- xvstelm.d D8, C2, 0x00, 0x00 +- xvstelm.d D12, C3, 0x00, 0x00 ++ xvstelm.d D4, C0, 0x00, 0x00 ++ xvstelm.d D5, C1, 0x00, 0x01 ++ xvstelm.d D6, C2, 0x00, 0x02 ++ xvstelm.d D7, C3, 0x00, 0x03 ++#endif // #if defined(TRMMKERNEL) + + /* Add stride for C */ + addi.d C0, C0, 0x08 +@@ -1952,6 +2086,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ///////////////////////////////////////////////// + /************************ Condition 1 if((N >> 2) && (M >> 4)) END !!! ************************/ + ++ xvldrepl.d VALPHA, $sp, 112 ++ + .L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 +@@ -1993,223 +2129,65 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ addi.d L, OFF, 2 + #endif + #else // #if !defined(TRMMKERNEL) +- move B0, B +- move L, K /* L = bk */ +-#endif +- +- /* Load 16 * 64 from A0 +- * U0 = {a3, a2, a1, a0} +- * U1 = {a7, a6, a5, a4} +- * U2 = {a11, a10, a9, a8} +- * U3 = {a15, a14, a13, a12} +- */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- /* line 1 */ +- xvfmul.d D0, U0, U4 +- xvfmul.d D1, U1, U4 +- xvfmul.d D2, U2, U4 +- xvfmul.d D3, U3, U4 +- +- xvldrepl.d U4, B0, 0x08 +- /* line 2 */ +- xvfmul.d D4, U0, U4 +- xvfmul.d D5, U1, U4 +- xvfmul.d D6, U2, U4 +- xvfmul.d D7, U3, U4 +- +- /* Add stride for A0 and B0 */ +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x10 +- /* Reduce L */ +- addi.d L, L, -1 +- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ +- /* if (TL < 1) goto L_N3_L7 */ +- beq ZERO,TL, .L_N3_L7 +- +-.L_N3_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x10 +- +- /***8-2***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x10 +- +- /***8-3***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x10 +- +- /***8-4***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x10 +- +- /***8-5***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x10 +- +- /***8-6***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- 
xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x10 ++ move B0, B ++ move L, K /* L = bk */ ++#endif + +- /***8-7***/ +- /* Load 16 * 64 from A0 */ ++ /* Load 16 * 64 from A0 ++ * U0 = {a3, a2, a1, a0} ++ * U1 = {a7, a6, a5, a4} ++ * U2 = {a11, a10, a9, a8} ++ * U3 = {a15, a14, a13, a12} ++ */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ xvfmul.d D2, U2, U4 ++ xvfmul.d D3, U3, U4 + +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x10 ++ xvldrepl.d U5, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U5 ++ xvfmul.d D5, U1, U5 ++ xvfmul.d D6, U2, U5 ++ xvfmul.d D7, U3, U5 + +- /***8-8***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N3_L7 */ ++ beq ZERO,TL, .L_N3_L7 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ xvld U10, A0, 0x40 ++ xvld U11, A0, 0x60 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + ++ beq ZERO, TL, .L_N3_TL1_END ++ ++.L_N3_TL1: /* TL-- */ ++ KERNEL8x16x2 ++ + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_TL1 ++.L_N3_TL1_END: ++ KERNEL8x16x2_END + + .L_N3_L7: + /* if (!(L & 7)) goto L_N3_L0 */ +@@ -2229,12 +2207,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 +@@ -2264,14 +2241,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ +- xvld U0, C1, 0x00 +- xvld U1, C1, 0x20 +- xvld U2, C1, 0x40 +- xvld U3, C1, 0x60 +- xvfmadd.d D4, D4, VALPHA, U0 +- xvfmadd.d D5, D5, VALPHA, U1 +- xvfmadd.d D6, D6, VALPHA, U2 +- xvfmadd.d D7, D7, VALPHA, U3 ++ xvld U4, C1, 0x00 ++ xvld U5, C1, 0x20 ++ xvld U6, C1, 0x40 ++ xvld U7, C1, 0x60 ++ xvfmadd.d D4, D4, VALPHA, U4 ++ xvfmadd.d D5, D5, VALPHA, U5 ++ xvfmadd.d D6, D6, VALPHA, U6 ++ xvfmadd.d D7, D7, VALPHA, U7 + #endif // #if defined(TRMMKERNEL) + + /* Store C0 */ +@@ -2352,10 +2329,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + +- xvldrepl.d U4, B0, 0x08 ++ xvldrepl.d U5, B0, 0x08 + /* line 2 */ +- xvfmul.d D4, U0, U4 +- xvfmul.d D5, U1, U4 ++ xvfmul.d D4, U0, U5 ++ xvfmul.d D5, U1, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 +@@ -2366,131 +2343,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + /* if (TL < 1) goto L_N3_M8_L7 */ + beq ZERO,TL, .L_N3_M8_L7 + +-.L_N3_M8_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x10 +- +- /***8-2***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x10 +- +- /***8-3***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x10 +- +- /***8-4***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x10 +- +- /***8-5***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- /* Cumulative D0~D15 */ +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x10 +- +- /***8-6***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x10 +- +- /***8-7***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + +- /***8-8***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ beq ZERO, TL, .L_N3_M8_TL1_END + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x10 ++.L_N3_M8_TL1: /* TL-- */ ++ KERNEL8x8x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M8_TL1 ++.L_N3_M8_TL1_END: ++ KERNEL8x8x2_END + + .L_N3_M8_L7: + /* if (!(L & 7)) goto L_N3_M8_L0 */ +@@ -2505,9 +2376,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 +@@ -2530,10 +2401,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ +- xvld U0, C1, 0x00 +- xvld U1, C1, 0x20 +- xvfmadd.d D4, D4, VALPHA, U0 +- xvfmadd.d D5, D5, VALPHA, U1 ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++ xvfmadd.d D4, D4, VALPHA, U2 ++ xvfmadd.d D5, D5, VALPHA, U3 + #endif // #if defined(TRMMKERNEL) + + /* Store C0 */ +@@ -2561,162 +2432,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + add.d B0, B0, T0 + #endif + +-#ifdef LEFT +- addi.d OFF, OFF, 0x08 +-#endif +-#endif // #if defined(TRMMKERNEL) +- +-/********LOOP (if(N & 2) && (M & 8) ) End************/ +- +-.L_N3_M4: +- andi I, M, 4 +- beq ZERO,I, .L_N3_M2 +- +-#if defined(TRMMKERNEL) +-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +- move B0, B +-#else +- slli.d T0, OFF, 0x05 +- add.d A0, A0, T0 +- slli.d T0, OFF, 0x04 +- add.d B0, B, T0 +-#endif +- +-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +- sub.d L, K, OFF +-#elif defined(LEFT) +- /* number of values in A */ +- addi.d L, OFF, 4 +-#else +- /* number of values in B */ +- addi.d L, OFF, 2 +-#endif +-#else // #if !defined(TRMMKERNEL) +- move B0, B +- move L, K /* L = bk */ +-#endif +- +- /* Load 4 * 64 from A0 */ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- /* line 1 */ +- xvfmul.d D0, U0, U4 +- +- xvldrepl.d U4, B0, 0x08 +- /* line 2 */ +- xvfmul.d D4, U0, U4 +- +- /* Add stride for A0 and B0 */ +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x10 +- /* Reduce L */ +- addi.d L, L, -1 +- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ +- /* if (TL < 1) goto L_N3_M4_L7 */ +- beq ZERO,TL, .L_N3_M4_L7 +- +-.L_N3_M4_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 8 * 64 from A0 */ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x10 +- +- /***8-2***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x10 +- +- /***8-3***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x10 +- +- /***8-4***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x10 ++#ifdef LEFT ++ addi.d OFF, OFF, 0x08 ++#endif ++#endif // #if defined(TRMMKERNEL) + +- /***8-5***/ +- xvld U0, A0, 0x00 ++/********LOOP (if(N & 2) && (M & 8) ) End************/ + +- /* Cumulative D0~D15 */ +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++.L_N3_M4: ++ andi I, M, 4 ++ beq ZERO,I, .L_N3_M2 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x05 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x04 ++ add.d B0, B, T0 ++#endif + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x10 ++#if (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 4 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 2 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif + +- /***8-6***/ ++ /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x10 ++ xvldrepl.d U5, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U5 + +- /***8-7***/ +- xvld U0, A0, 0x00 ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N3_M4_L7 */ ++ beq ZERO,TL, .L_N3_M4_L7 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ xvld U8, A0, 0x00 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + +- /***8-8***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ beq ZERO, TL, .L_N3_M4_TL1_END + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x10 ++.L_N3_M4_TL1: /* TL-- */ ++ KERNEL8x4x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M4_TL1 ++.L_N3_M4_TL1_END: ++ KERNEL8x4x2_END + + .L_N3_M4_L7: + /* if (!(L & 7)) goto L_N3_M4_L0 */ +@@ -2729,8 +2517,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 +@@ -2749,8 +2537,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ +- xvld U0, C1, 0x00 +- xvfmadd.d D4, D4, VALPHA, U0 ++ xvld U1, C1, 0x00 ++ xvfmadd.d D4, D4, VALPHA, U1 + #endif // #if defined(TRMMKERNEL) + + /* Store C0 */ +@@ -2830,106 +2618,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ /* if (TL < 1) goto L_N3_M2_L7 */ + beq ZERO,TL, .L_N3_M2_L7 + +-.L_N3_M2_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 2 * 64 from A0 */ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x10 +- +- /***8-2***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x10 +- +- /***8-3***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x10 +- +- /***8-4***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x10 +- +- /***8-5***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x10 +- +- /***8-6***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x10 ++ xvld U8, A0, 0x00 + +- /***8-7***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + +- /***8-8***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ beq ZERO, TL, .L_N3_M2_TL1_END + +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x10 ++.L_N3_M2_TL1: /* TL-- */ ++ KERNEL8x2x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M2_TL1 ++.L_N3_M2_TL1_END: ++ KERNEL8x2x2_END + + .L_N3_M2_L7: + /* if (!(L & 7)) goto L_N3_M2_L0 */ +@@ -2942,8 +2648,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 +@@ -2962,8 +2668,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ +- xvld U0, C1, 0x00 +- xvfmadd.d D4, D4, VALPHA, U0 ++ xvld U1, C1, 0x00 ++ xvfmadd.d D4, D4, VALPHA, U1 + #endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 +@@ -3017,132 +2723,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ #else + /* number of values in B */ + addi.d L, OFF, 2 +-#endif +-#else // #if !defined(TRMMKERNEL) +- move B0, B +- move L, K /* L = bk */ +-#endif +- +- /* Load 1 * 64 from A0 */ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- /* line 1 */ +- xvfmul.d D0, U0, U4 +- +- xvldrepl.d U4, B0, 0x08 +- /* line 2 */ +- xvfmul.d D4, U0, U4 +- +- /* Add stride for A0 and B0 */ +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x10 +- /* Reduce L */ +- addi.d L, L, -1 +- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ +- /* if (TL < 1) goto L_N3_M1_L7 */ +- beq ZERO,TL, .L_N3_M1_L7 +- +-.L_N3_M1_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 1 * 64 from A0 */ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x10 +- +- /***8-2***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x10 +- +- /***8-3***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x10 +- +- /***8-4***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x10 +- +- /***8-5***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x10 +- +- /***8-6***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x10 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif + +- /***8-7***/ ++ /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 + +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x10 ++ xvldrepl.d U4, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U4 + +- /***8-8***/ +- xvld U0, A0, 0x00 ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N3_M1_L7 */ ++ beq ZERO,TL, .L_N3_M1_L7 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ xvld U8, A0, 0x00 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + ++ beq ZERO, TL, .L_N3_M1_TL1_END ++ ++.L_N3_M1_TL1: /* TL-- */ ++ KERNEL8x1x2 ++ + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M1_TL1 ++.L_N3_M1_TL1_END: ++ KERNEL8x1x2_END + + .L_N3_M1_L7: + /* if (!(L & 7)) goto L_N3_M1_L0 */ +@@ -3155,8 +2779,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 +@@ -3175,8 +2799,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ +- xvld U0, C1, 0x00 +- xvfmadd.d D4, D4, VALPHA, U0 ++ xvld U1, C1, 0x00 ++ xvfmadd.d D4, D4, VALPHA, U1 + #endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 +@@ -3300,137 +2924,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + /* if (TL < 1) goto L_N1_L7 */ + beq ZERO,TL, .L_N1_L7 + +-.L_N1_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x08 +- +- /***8-2***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x08 +- +- /***8-3***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x08 +- +- /***8-4***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x08 +- +- /***8-5***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x08 +- +- /***8-6***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x08 +- +- /***8-7***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ xvld U10, A0, 0x40 ++ xvld U11, A0, 0x60 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + +- /***8-8***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x08 ++ beq ZERO, TL, .L_N1_TL1_END ++.L_N1_TL1: /* TL-- */ ++ KERNEL8x16x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_TL1 ++.L_N1_TL1_END: ++ KERNEL8x16x1_END + + .L_N1_L7: + /* if (!(L & 7)) goto L_N1_L0 */ +@@ -3494,161 +3006,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ #endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 +- slli.d T0, L, 0x03 +- add.d B0, B0, T0 +-#endif +- +-#ifdef LEFT +- addi.d OFF, OFF, 0x10 +-#endif +-#endif // #if defined(TRMMKERNEL) +- +- addi.d I, I, -1 /* I-- */ +- blt ZERO,I, .L_N1_I1 +- +-.L_N1_M8: +- /* We have done M & 16, considering M=8/4/2/1 */ +- andi I, M, 15 +- beq ZERO,I, .L_N1_M0 +- +- andi I, M, 8 +- beq ZERO,I, .L_N1_M4 +- +-#if defined(TRMMKERNEL) +-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +- move B0, B +-#else +- slli.d T0, OFF, 0x06 +- add.d A0, A0, T0 +- slli.d T0, OFF, 0x03 +- add.d B0, B, T0 +-#endif +- +-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +- sub.d L, K, OFF +-#elif defined(LEFT) +- /* number of values in A */ +- addi.d L, OFF, 8 +-#else +- /* number of values in B */ +- addi.d L, OFF, 1 +-#endif +-#else // #if !defined(TRMMKERNEL) +- move B0, B +- move L, K /* L = bk */ +-#endif +- +- /* Load 8 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- /* line 1 */ +- xvfmul.d D0, U0, U4 +- xvfmul.d D1, U1, U4 +- +- /* Add stride for A0 and B0 */ +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x08 +- /* Reduce L */ +- addi.d L, L, -1 +- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ +- /* if (TL < 1) goto L_N1_M8_L7 */ +- beq ZERO,TL, .L_N1_M8_L7 +- +-.L_N1_M8_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x08 +- +- /***8-2***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x08 +- +- /***8-3***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x08 +- +- /***8-4***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x08 ++ slli.d T0, L, 0x03 ++ add.d B0, B0, T0 ++#endif + +- /***8-5***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 ++#ifdef LEFT ++ addi.d OFF, OFF, 0x10 ++#endif ++#endif // #if defined(TRMMKERNEL) + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 ++ addi.d I, I, -1 /* I-- */ ++ blt ZERO,I, .L_N1_I1 + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x08 ++.L_N1_M8: ++ /* We have done M & 16, considering M=8/4/2/1 */ ++ andi I, M, 15 ++ beq ZERO,I, .L_N1_M0 + +- /***8-6***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 ++ andi I, M, 8 ++ beq ZERO,I, .L_N1_M4 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x06 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x03 ++ add.d B0, B, T0 ++#endif + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x08 ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 8 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 1 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif + +- /***8-7***/ ++ /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + +- xvldrepl.d U4, 
B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x08 ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_M8_L7 */ ++ beq ZERO,TL, .L_N1_M8_L7 + +- /***8-8***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + ++ beq ZERO, TL, .L_N1_M8_TL1_END ++.L_N1_M8_TL1: /* TL-- */ ++ KERNEL8x8x1 ++ + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M8_TL1 + ++.L_N1_M8_TL1_END: ++ KERNEL8x8x1_END ++ + .L_N1_M8_L7: + /* if (!(L & 7)) goto L_N1_M8_L0 */ + andi TL, L, 7 +@@ -3753,81 +3191,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 + +-.L_N1_M4_TL1: /* TL-- */ +- /***8-1***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x08 +- +- /***8-2***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x08 +- +- /***8-3***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x08 +- +- /***8-4***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x08 +- +- /***8-5***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x08 +- +- /***8-6***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x08 +- +- /***8-7***/ +- xvld U0, A0, 0x00 ++ xvld U8, A0, 0x00 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + +- /***8-8***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ beq ZERO, TL, .L_N1_M4_TL1_END + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x08 ++.L_N1_M4_TL1: /* TL-- */ ++ KERNEL8x4x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 ++.L_N1_M4_TL1_END: ++ KERNEL8x4x1_END + + .L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ +@@ -3927,82 +3307,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 + +-.L_N1_M2_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 2 * 64 from A0 */ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x08 +- +- /***8-2***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x08 +- +- /***8-3***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x08 +- +- /***8-4***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x08 +- +- /***8-5***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x08 +- +- /***8-6***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x08 +- +- /***8-7***/ +- xvld U0, A0, 0x00 ++ xvld U8, A0, 0x00 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + +- /***8-8***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ beq ZERO, TL, .L_N1_M2_TL1_END + +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x08 ++.L_N1_M2_TL1: /* TL-- */ ++ KERNEL8x2x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 ++.L_N1_M2_TL1_END: ++ KERNEL8x2x1_END + + .L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ +@@ -4101,82 +3422,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + +-.L_N1_M1_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 1 * 64 from A0 */ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x08 +- +- /***8-2***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x08 +- +- /***8-3***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x08 +- +- /***8-4***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x08 +- +- /***8-5***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x08 +- +- /***8-6***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x08 +- +- /***8-7***/ +- xvld U0, A0, 0x00 ++ xvld U8, A0, 0x00 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + +- /***8-8***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ beq ZERO, TL, .L_N1_M1_TL1_END + +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x08 ++.L_N1_M1_TL1: /* TL-- */ ++ KERNEL8x1x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 ++.L_N1_M1_TL1_END: ++ KERNEL8x1x1_END + + .L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ +@@ -4243,7 +3505,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LD $f23, $sp, 40 +- addi.d $sp, $sp, 56 ++ LD $f24, $sp, 48 ++ LD $f25, $sp, 56 ++ LD $f26, $sp, 64 ++ LD $f27, $sp, 72 ++ LD $f28, $sp, 80 ++ LD $f29, $sp, 88 ++ LD $f30, $sp, 96 ++ LD $f31, $sp, 104 ++ addi.d $sp, $sp, 120 + + jirl $r0, $r1, 0x0 + +diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S +new file mode 100644 +index 000000000..a49bf9bb1 +--- /dev/null ++++ b/kernel/loongarch64/dgemv_n_8_lasx.S +@@ -0,0 +1,554 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/07/14 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++*********************************************************************/ ++ ++/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, ++ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) ++ */ ++#define M $r4 ++#define N $r5 ++#define ALPHA $f0 ++#define A $r7 ++#define LDA $r8 ++#define X $r9 ++#define INC_X $r10 ++#define Y $r11 ++#define INC_Y $r6 ++ ++#define J $r12 ++#define I $r13 ++#define K $r14 ++#define Y_ORG $r15 ++#define OFFSET $r16 ++#define K_LDA $r17 ++#define M8 $r18 ++#define T0 $r19 ++#define PA0 $r20 ++#define PA1 $r23 ++#define PA2 $r24 ++#define PA3 $r25 ++#define PA4 $r26 ++#define PA5 $r27 ++#define PA6 $r28 ++#define PA7 $r29 ++ ++#define VALPHA $xr1 ++#define X0 $xr2 ++#define X1 $xr3 ++#define X2 $xr4 ++#define X3 $xr5 ++#define X4 $xr6 ++#define X5 $xr7 ++#define X6 $xr8 ++#define X7 $xr9 ++#define Y0 $xr10 ++#define Y1 $xr11 ++#define A0 $xr12 ++#define A1 $xr13 ++#define A2 $xr14 ++#define A3 $xr15 ++#define A4 $xr16 ++#define A5 $xr17 ++#define A6 $xr18 ++#define A7 $xr19 ++#define A8 $xr20 ++#define A9 $xr21 ++#define A10 $xr22 ++#define A11 $xr23 ++#define A12 $xr24 ++#define A13 $xr25 ++#define A14 $xr26 ++#define A15 $xr27 ++ ++.macro DLOAD_X_8 ++ GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ ++ X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 ++ GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ ++ X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA ++.endm ++ ++.macro DLOAD_X_4 ++ GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18 ++ GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA ++.endm ++ ++.macro DLOAD_X_2 ++ GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08 ++ GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA ++.endm ++ ++.macro DLOAD_X_1 ++ GLDREPL xv, d, X0, X, 0x00 ++ GMUL xvf, d, X0, X0, VALPHA ++.endm ++ ++.macro DLOAD_Y_8 ++ GLD xv, , Y0, Y, 0, Y1, Y, 0x20 ++.endm ++ ++.macro DLOAD_Y_4 ++ GLD xv, , Y0, Y, 0 ++.endm ++ ++.macro DLOAD_Y_1 ++ fld.d $f10, Y, 0 ++.endm ++ ++.macro DSTORE_Y_8 ++ GST xv, , Y0, Y, 0, Y1, Y, 0x20 ++.endm ++ ++.macro DSTORE_Y_4 ++ GST xv, , Y0, Y, 0 ++.endm ++ ++.macro DSTORE_Y_1 ++ fst.d $f10, Y, 0 ++.endm ++ ++// Unable to use vector load/store ins ++.macro DLOAD_Y_8_GAP ++ fld.d $f10, Y, 0 ++ fldx.d $f13, Y, INC_Y ++ PTR_ALSL T0, INC_Y, Y, 1 ++ fld.d $f14, T0, 0 ++ fldx.d $f15, T0, INC_Y ++ PTR_ALSL T0, INC_Y, Y, 2 ++ fld.d $f11, T0, 0 ++ fldx.d $f17, T0, INC_Y ++ PTR_ADD T0, T0, INC_Y ++ PTR_ADD T0, T0, INC_Y ++ fld.d $f18, T0, 0 ++ fldx.d $f19, T0, INC_Y ++ GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3 ++.endm ++ ++.macro DLOAD_Y_4_GAP ++ fld.d $f10, Y, 0 ++ fldx.d $f13, Y, INC_Y ++ PTR_ALSL T0, INC_Y, Y, 1 ++ fld.d $f14, T0, 0 ++ fldx.d $f15, T0, INC_Y ++ GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3 ++.endm ++ ++.macro DSTORE_Y_8_GAP ++ xvstelm.d Y0, Y, 0, 0 ++ PTR_ADD T0, Y, INC_Y ++ xvstelm.d Y0, T0, 0, 1 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.d Y0, T0, 0, 2 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.d Y0, T0, 0, 3 ++ ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.d Y1, T0, 0, 0 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.d Y1, T0, 0, 1 ++ PTR_ADD T0, T0, 
INC_Y ++ xvstelm.d Y1, T0, 0, 2 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.d Y1, T0, 0, 3 ++.endm ++ ++.macro DSTORE_Y_4_GAP ++ xvstelm.d Y0, Y, 0, 0 ++ PTR_ADD T0, Y, INC_Y ++ xvstelm.d Y0, T0, 0, 1 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.d Y0, T0, 0, 2 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.d Y0, T0, 0, 3 ++.endm ++ ++.macro DLOAD_X_8_GAP ++ xvldrepl.d X0, X, 0x00 ++ PTR_ADD T0, X, INC_X ++ xvldrepl.d X1, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.d X2, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.d X3, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.d X4, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.d X5, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.d X6, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.d X7, T0, 0x00 ++ GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ ++ X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA ++.endm ++ ++.macro DLOAD_X_4_GAP ++ xvldrepl.d X0, X, 0x00 ++ PTR_ADD T0, X, INC_X ++ xvldrepl.d X1, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.d X2, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.d X3, T0, 0x00 ++ GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA ++.endm ++ ++.macro DLOAD_X_2_GAP ++ xvldrepl.d X0, X, 0x00 ++ PTR_ADD T0, X, INC_X ++ xvldrepl.d X1, T0, 0x00 ++ GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA ++.endm ++ ++.macro DGEMV_N_8x8 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA0, 0, \ ++ A2, PA1, 0, A3, PA1, 0, \ ++ A4, PA2, 0, A5, PA2, 0, \ ++ A6, PA3, 0, A7, PA3, 0, \ ++ A8, PA4, 0, A9, PA4, 0, \ ++ A10, PA5, 0, A11, PA5, 0, \ ++ A12, PA6, 0, A13, PA6, 0, \ ++ A14, PA7, 0, A15, PA7, 0 ++ ++ GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ ++ Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ ++ Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ ++ Y0, A6, X3, Y0, Y1, A7, X3, Y1, \ ++ Y0, A8, X4, Y0, Y1, A9, X4, Y1, \ ++ Y0, A10, X5, Y0, Y1, A11, X5, Y1, \ ++ Y0, A12, X6, Y0, Y1, A13, X6, Y1, \ ++ Y0, A14, X7, Y0, Y1, A15, X7, Y1 ++.endm ++ ++.macro DGEMV_N_4x8 ++ GLD_INC xv, , 0x20, A0, PA0, 0, \ ++ A2, PA1, 0, \ ++ A4, PA2, 0, \ ++ A6, PA3, 0, \ ++ A8, PA4, 0, \ ++ A10, PA5, 0, \ ++ A12, PA6, 0, \ ++ A14, PA7, 0 ++ ++ GMADD xvf, d, Y0, A0, X0, Y0, \ ++ Y0, A2, X1, Y0, \ ++ Y0, A4, X2, Y0, \ ++ Y0, A6, X3, Y0, \ ++ Y0, A8, X4, Y0, \ ++ Y0, A10, X5, Y0, \ ++ Y0, A12, X6, Y0, \ ++ Y0, A14, X7, Y0 ++.endm ++ ++.macro DGEMV_N_1x8 ++ GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \ ++ $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0 ++ GMADD f, d, $f10, $f12, $f2, $f10, \ ++ $f10, $f14, $f3, $f10, \ ++ $f10, $f16, $f4, $f10, \ ++ $f10, $f18, $f5, $f10, \ ++ $f10, $f20, $f6, $f10, \ ++ $f10, $f22, $f7, $f10, \ ++ $f10, $f24, $f8, $f10, \ ++ $f10, $f26, $f9, $f10, ++.endm ++ ++.macro DGEMV_N_8x4 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA0, 0, \ ++ A2, PA1, 0, A3, PA1, 0, \ ++ A4, PA2, 0, A5, PA2, 0, \ ++ A6, PA3, 0, A7, PA3, 0 ++ ++ GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ ++ Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ ++ Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ ++ Y0, A6, X3, Y0, Y1, A7, X3, Y1 ++.endm ++ ++.macro DGEMV_N_4x4 ++ GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 ++ ++ GMADD xvf, d, Y0, A0, X0, Y0, Y0, A2, X1, Y0, \ ++ Y0, A4, X2, Y0, Y0, A6, X3, Y0 ++.endm ++ ++.macro DGEMV_N_1x4 ++ GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0 ++ GMADD f, d, $f10, $f12, $f2, $f10, $f10, $f14, $f3, $f10, \ ++ $f10, $f16, $f4, $f10, $f10, $f18, $f5, $f10 ++.endm ++ ++.macro DGEMV_N_8x2 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA0, 0, \ ++ A2, PA1, 0, A3, 
PA1, 0 ++ GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ ++ Y0, A2, X1, Y0, Y1, A3, X1, Y1 ++.endm ++ ++.macro DGEMV_N_4x2 ++ GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 ++ GMADD xvf, d, Y0, A0, X0, Y0, \ ++ Y0, A2, X1, Y0 ++.endm ++ ++.macro DGEMV_N_1x2 ++ GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0 ++ GMADD f, d, $f10, $f12, $f2, $f10, \ ++ $f10, $f14, $f3, $f10 ++.endm ++ ++.macro DGEMV_N_1x1 ++ fld.d $f12, PA0, 0 ++ PTR_ADDI PA0, PA0, 0x08 ++ fmadd.d $f10, $f12, $f2, $f10 ++.endm ++ ++.macro DGEMV_N_LASX XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req ++ PTR_SRLI J, N, 3 ++ beqz J, .L_\XW\()_N_7 ++ PTR_SLLI K_LDA, LDA, 3 ++ PTR_SUB K_LDA, K_LDA, M8 ++.L_\XW\()_N_L8: ++ DLOAD_\X_8 ++ xor K, K, K ++ move Y, Y_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_M_7 ++.align 5 ++.L_\XW\()_M_L8: ++ DLOAD_\Y_8 ++ DGEMV_N_8x8 ++ DSTORE_\Y_8 ++ PTR_ADDI I, I, -1 ++ PTR_ALSL Y, INC_Y, Y, 3 ++ PTR_ADDI K, K, 8 ++ bnez I, .L_\XW\()_M_L8 ++.L_\XW\()_M_7: ++ andi I, M, 4 ++ beqz I, .L_\XW\()_M_3 ++ DLOAD_\Y_4 ++ DGEMV_N_4x8 ++ DSTORE_\Y_4 ++ PTR_ALSL Y, INC_Y, Y, 2 ++ PTR_ADDI K, K, 4 ++.L_\XW\()_M_3: ++ andi I, M, 3 ++ beqz I, .L_\XW\()_M_END ++.align 5 ++.L_\XW\()_M_L1: ++ DLOAD_\Y_1 ++ DGEMV_N_1x8 ++ DSTORE_\Y_1 ++ PTR_ADDI I, I, -1 ++ PTR_ADD Y, Y, INC_Y ++ PTR_ADDI K, K, 1 ++ bnez I, .L_\XW\()_M_L1 ++.L_\XW\()_M_END: ++ PTR_ADDI J, J, -1 ++#if __loongarch_grlen == 64 ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#endif ++ PTR_ALSL X, INC_X, X, 3 ++ bnez J, .L_\XW\()_N_L8 ++.L_\XW\()_N_7: ++ andi J, N, 4 ++ beqz J, .L_\XW\()_N_3 ++ DLOAD_\X_4 ++ xor K, K, K ++ move Y, Y_ORG ++ ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_N_4_M_7 ++.align 5 ++.L_\XW\()_N_4_M_L8: ++ DLOAD_\Y_8 ++ DGEMV_N_8x4 ++ DSTORE_\Y_8 ++ PTR_ADDI I, I, -1 ++ PTR_ADDI K, K, 8 ++ PTR_ALSL Y, INC_Y, Y, 3 ++ bnez I, .L_\XW\()_N_4_M_L8 ++.L_\XW\()_N_4_M_7: ++ andi I, M, 4 ++ beqz I, .L_\XW\()_N_4_M_3 ++ DLOAD_\Y_4 ++ DGEMV_N_4x4 ++ DSTORE_\Y_4 ++ PTR_ALSL Y, INC_Y, Y, 2 ++ PTR_ADDI K, K, 4 ++.L_\XW\()_N_4_M_3: ++ andi I, M, 3 ++ beqz I, .L_\XW\()_N_4_M_END ++.align 5 ++.L_\XW\()_N_4_M_L1: ++ DLOAD_\Y_1 ++ DGEMV_N_1x4 ++ DSTORE_\Y_1 ++ PTR_ADDI I, I, -1 ++ PTR_ADD Y, Y, INC_Y ++ PTR_ADDI K, K, 1 ++ bnez I, .L_\XW\()_N_4_M_L1 ++.L_\XW\()_N_4_M_END: ++ PTR_SLLI K_LDA, LDA, 2 ++ PTR_SUB K_LDA, K_LDA, M8 ++#if __loongarch_grlen == 64 ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#endif ++ PTR_ALSL X, INC_X, X, 2 ++.L_\XW\()_N_3: ++ andi J, N, 2 ++ beqz J, .L_\XW\()_N_1 ++ DLOAD_\X_2 ++ xor K, K, K ++ move Y, Y_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_N_2_M_7 ++.align 5 ++.L_\XW\()_N_2_M_L8: ++ DLOAD_\Y_8 ++ DGEMV_N_8x2 ++ DSTORE_\Y_8 ++ PTR_ADDI I, I, -1 ++ PTR_ADDI K, K, 8 ++ PTR_ALSL Y, INC_Y, Y, 3 ++ bnez I, .L_\XW\()_N_2_M_L8 ++.L_\XW\()_N_2_M_7: ++ andi I, M, 4 ++ beqz I, .L_\XW\()_N_2_M_3 ++ DLOAD_\Y_4 ++ DGEMV_N_4x2 ++ DSTORE_\Y_4 
++ PTR_ALSL Y, INC_Y, Y, 2 ++ PTR_ADDI K, K, 4 ++.L_\XW\()_N_2_M_3: ++ andi I, M, 3 ++ beqz I, .L_\XW\()_N_2_M_END ++.align 5 ++.L_\XW\()_N_2_M_L1: ++ DLOAD_\Y_1 ++ DGEMV_N_1x2 ++ DSTORE_\Y_1 ++ PTR_ADDI I, I, -1 ++ PTR_ADD Y, Y, INC_Y ++ PTR_ADDI K, K, 1 ++ bnez I, .L_\XW\()_N_2_M_L1 ++.L_\XW\()_N_2_M_END: ++ PTR_SLLI K_LDA, LDA, 1 ++ PTR_SUB K_LDA, K_LDA, M8 ++ PTR_ADD PA0, PA0, K_LDA ++ PTR_ADD PA1, PA1, K_LDA ++ PTR_ALSL X, INC_X, X, 1 ++.L_\XW\()_N_1: ++ andi J, N, 1 ++ beqz J, .L_END ++ DLOAD_\X_1 ++ xor K, K, K ++ move Y, Y_ORG ++ move I, M ++ beqz I, .L_END ++.align 5 ++.L_\XW\()_N_1_M_L1: ++ DLOAD_\Y_1 ++ DGEMV_N_1x1 ++ DSTORE_\Y_1 ++ PTR_ADDI I, I, -1 ++ PTR_ADD Y, Y, INC_Y ++ PTR_ADDI K, K, 1 ++ bnez I, .L_\XW\()_N_1_M_L1 ++ b .L_END ++.endm ++ ++ PROLOGUE ++ PTR_LD INC_Y, $sp, 0 ++ push_if_used 17 + 7, 24 + 4 ++ PTR_ADDI K, $r0, 0x01 ++ PTR_SUB I, INC_X, K ++ PTR_SUB J, INC_Y, K ++ maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ ++ maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ ++ PTR_ALSL I, I, J, 1 ++ GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 ++ xvreplve0.d VALPHA, $xr0 ++ move Y_ORG, Y ++ move PA0, A ++#if __loongarch_grlen == 64 ++ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#else ++ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#endif ++ la.local T0, .L_GAP_TABLE ++ PTR_ALSL I, I, T0, 1 ++ ld.h K, I, 0 ++ PTR_ADD T0, T0, K ++ jirl $r0, T0, 0 ++.L_GAP_TABLE: ++ .hword .L_GAP_0_0 - .L_GAP_TABLE ++ .hword .L_GAP_0_1 - .L_GAP_TABLE ++ .hword .L_GAP_1_0 - .L_GAP_TABLE ++ .hword .L_GAP_1_1 - .L_GAP_TABLE ++.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ ++ DGEMV_N_LASX GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 ++.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ ++ DGEMV_N_LASX GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 ++.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ ++ DGEMV_N_LASX GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 ++.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ ++ DGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 ++.L_END: ++ pop_if_used 17 + 7, 24 + 4 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S +new file mode 100644 +index 000000000..71f942b0f +--- /dev/null ++++ b/kernel/loongarch64/dgemv_t_8_lasx.S +@@ -0,0 +1,481 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. 
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/07/17 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++*********************************************************************/ ++ ++/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, ++ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) ++ */ ++#define M $r4 ++#define N $r5 ++#define ALPHA $f0 ++#define A $r7 ++#define LDA $r8 ++#define X $r9 ++#define INC_X $r10 ++#define Y $r11 ++#define INC_Y $r6 ++ ++#define J $r12 ++#define I $r13 ++#define K $r14 ++#define PY0 $r14 ++#define X_ORG $r15 ++#define PY1 $r16 ++#define K_LDA $r17 ++#define PY2 $r18 ++#define T0 $r19 ++#define PA0 $r20 ++#define PA1 $r23 ++#define PA2 $r24 ++#define PA3 $r25 ++#define PA4 $r26 ++#define PA5 $r27 ++#define PA6 $r28 ++#define PA7 $r29 ++#define M8 $r30 ++ ++#define VALPHA $xr0 ++#define X0 $xr1 ++#define X1 $xr2 ++#define A0 $xr3 ++#define A1 $xr4 ++#define A2 $xr5 ++#define A3 $xr6 ++#define A4 $xr7 ++#define A5 $xr8 ++#define A6 $xr9 ++#define A7 $xr10 ++#define A8 $xr11 ++#define A9 $xr12 ++#define A10 $xr13 ++#define A11 $xr14 ++#define A12 $xr15 ++#define A13 $xr16 ++#define A14 $xr17 ++#define A15 $xr18 ++#define TP0 $xr19 ++#define TP1 $xr20 ++#define TP2 $xr21 ++#define TP3 $xr22 ++#define TP4 $xr23 ++#define TP5 $xr24 ++#define TP6 $xr25 ++#define TP7 $xr26 ++#define Y0 $xr3 ++#define Y1 $xr4 ++#define Y2 $xr5 ++#define Y3 $xr6 ++#define Y4 $xr7 ++#define Y5 $xr8 ++#define Y6 $xr9 ++#define Y7 $xr10 ++ ++.macro ZERO_Y8 ++ GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ ++ TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 ++.endm ++ ++.macro ZERO_Y4 ++ GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 ++.endm ++ ++.macro ZERO_Y2 ++ GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1 ++.endm ++ ++.macro ZERO_Y1 ++ GXOR xv, v, TP0, TP0, TP0 ++.endm ++ ++.macro DLOAD_X8 ++ GLD xv, , X0, X, 0x00, X1, X, 0x20 ++.endm ++ ++.macro DLOAD_X4 ++ GLD xv, , X0, X, 0x00 ++.endm ++ ++.macro DLOAD_X8_GAP ++ fld.d $f1, X, 0x00 ++ fldx.d $f2, X, INC_X ++ PTR_ALSL T0, INC_X, X, 1 ++ fld.d $f3, T0, 0x00 ++ fldx.d $f4, T0, INC_X ++ GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 ++ PTR_ALSL T0, INC_X, X, 2 ++ fld.d $f2, T0, 0x00 ++ fldx.d $f3, T0, INC_X ++ PTR_ALSL T0, INC_X, T0, 1 ++ fld.d $f4, T0, 0x00 ++ fldx.d $f5, T0, INC_X ++ GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3 ++.endm ++ ++.macro DLOAD_X4_GAP ++ fld.d $f1, X, 0x00 ++ fldx.d $f2, X, INC_X ++ PTR_ALSL T0, INC_X, X, 1 ++ fld.d $f3, T0, 0x00 ++ 
fldx.d $f4, T0, INC_X ++ GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 ++.endm ++ ++.macro DGEMV_T_8x8 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA0, 0, \ ++ A2, PA1, 0, A3, PA1, 0, \ ++ A4, PA2, 0, A5, PA2, 0, \ ++ A6, PA3, 0, A7, PA3, 0, \ ++ A8, PA4, 0, A9, PA4, 0, \ ++ A10, PA5, 0, A11, PA5, 0, \ ++ A12, PA6, 0, A13, PA6, 0, \ ++ A14, PA7, 0, A15, PA7, 0 ++ ++ GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ ++ TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ ++ TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ ++ TP3, A6, X0, TP3, TP3, A7, X1, TP3, \ ++ TP4, A8, X0, TP4, TP4, A9, X1, TP4, \ ++ TP5, A10, X0, TP5, TP5, A11, X1, TP5, \ ++ TP6, A12, X0, TP6, TP6, A13, X1, TP6, \ ++ TP7, A14, X0, TP7, TP7, A15, X1, TP7 ++.endm ++ ++.macro DGEMV_T_8x4 ++ GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0, \ ++ A8, PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0 ++ ++ GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ ++ TP2, A4, X0, TP2, TP3, A6, X0, TP3, \ ++ TP4, A8, X0, TP4, TP5, A10, X0, TP5, \ ++ TP6, A12, X0, TP6, TP7, A14, X0, TP7, ++.endm ++ ++.macro DGEMV_T_4x8 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA0, 0, \ ++ A2, PA1, 0, A3, PA1, 0, \ ++ A4, PA2, 0, A5, PA2, 0, \ ++ A6, PA3, 0, A7, PA3, 0 ++ ++ GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ ++ TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ ++ TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ ++ TP3, A6, X0, TP3, TP3, A7, X1, TP3 ++.endm ++ ++.macro DGEMV_T_4x4 ++ GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 ++ ++ GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ ++ TP2, A4, X0, TP2, TP3, A6, X0, TP3 ++.endm ++ ++.macro DGEMV_T_2x8 ++ GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0, A2, PA1, 0, A3, PA1, 0 ++ ++ GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ ++ TP1, A2, X0, TP1, TP1, A3, X1, TP1 ++.endm ++ ++.macro DGEMV_T_2x4 ++ GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 ++ ++ GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1 ++.endm ++ ++.macro DGEMV_T_LASX XW:req X8:req, X4:req ++ PTR_SRLI J, N, 3 ++ beqz J, .L_\XW\()_N_7 ++ PTR_SLLI K_LDA, LDA, 3 ++ PTR_SUB K_LDA, K_LDA, M8 ++.L_\XW\()_N_L8: ++ ZERO_Y8 ++ move X, X_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_M_7 ++.align 5 ++.L_\XW\()_M_L8: ++ DLOAD_\X8 ++ DGEMV_T_8x8 ++ PTR_ADDI I, I, -1 ++ PTR_ALSL X, INC_X, X, 3 ++ bnez I, .L_\XW\()_M_L8 ++.L_\XW\()_M_7: ++ andi I, M, 4 ++ beqz I, .L_\XW\()_M_3 ++ DLOAD_\X4 ++ DGEMV_T_8x4 ++ PTR_ALSL X, INC_X, X, 2 ++.L_\XW\()_M_3: ++ // Accumulated ++ GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \ ++ Y5, TP5, Y6, TP6, Y7, TP7 ++ andi I, M, 3 ++ beqz I, .L_\XW\()_M_END ++.align 5 ++.L_\XW\()_M_L1: ++ fld.d $f1, X, 0x00 ++ fld.d $f11, PA0, 0x00 ++ fld.d $f12, PA1, 0x00 ++ fld.d $f13, PA2, 0x00 ++ fld.d $f14, PA3, 0x00 ++ fld.d $f15, PA4, 0x00 ++ fld.d $f16, PA5, 0x00 ++ fld.d $f17, PA6, 0x00 ++ fld.d $f18, PA7, 0x00 ++#if __loongarch_grlen == 64 ++ GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ ++ PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 ++#elif __loongarch_grlen == 32 ++ GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ ++ PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 ++#else ++ GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ ++ PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 ++#endif ++ GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \ ++ $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10 ++ 
PTR_ADDI I, I, -1 ++ PTR_ADD X, X, INC_X ++ bnez I, .L_\XW\()_M_L1 ++.L_\XW\()_M_END: ++ fld.d $f11, Y, 0x00 ++ fldx.d $f12, Y, INC_Y ++ PTR_ALSL PY0, INC_Y, Y, 1 ++ fld.d $f13, PY0, 0x00 ++ fldx.d $f14, PY0, INC_Y ++ PTR_ALSL PY1, INC_Y, Y, 2 ++ fld.d $f15, PY1, 0x00 ++ fldx.d $f16, PY1, INC_Y ++ PTR_ALSL PY2, INC_Y, PY1, 1 ++ fld.d $f17, PY2, 0x00 ++ fldx.d $f18, PY2, INC_Y ++ ++ GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \ ++ $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18 ++ ++ PTR_ADDI J, J, -1 ++#if __loongarch_grlen == 64 ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#endif ++ fst.d $f11, Y, 0x00 ++ fstx.d $f12, Y, INC_Y ++ fst.d $f13, PY0, 0x00 ++ fstx.d $f14, PY0, INC_Y ++ fst.d $f15, PY1, 0x00 ++ fstx.d $f16, PY1, INC_Y ++ fst.d $f17, PY2, 0x00 ++ fstx.d $f18, PY2, INC_Y ++ PTR_ALSL Y, INC_Y, Y, 3 ++ bnez J, .L_\XW\()_N_L8 ++.L_\XW\()_N_7: ++ andi J, N, 4 ++ beqz J, .L_\XW\()_N_3 ++ ZERO_Y4 ++ move X, X_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_N_4_M_7 ++.align 5 ++.L_\XW\()_N_4_M_L8: ++ DLOAD_\X8 ++ DGEMV_T_4x8 ++ PTR_ADDI I, I, -1 ++ PTR_ALSL X, INC_X, X, 3 ++ bnez I, .L_\XW\()_N_4_M_L8 ++.L_\XW\()_N_4_M_7: ++ andi I, M, 4 ++ beqz I, .L_\XW\()_N_4_M_3 ++ DLOAD_\X4 ++ DGEMV_T_4x4 ++ PTR_ALSL X, INC_X, X, 2 ++.L_\XW\()_N_4_M_3: ++ // Accumulated ++ GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 ++ andi I, M, 3 ++ beqz I, .L_\XW\()_N_4_M_END ++.align 5 ++.L_\XW\()_N_4_M_L1: ++ fld.d $f1, X, 0x00 ++ GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00 ++ GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6 ++ PTR_ADDI I, I, -1 ++ PTR_ADD X, X, INC_X ++ bnez I, .L_\XW\()_N_4_M_L1 ++.L_\XW\()_N_4_M_END: ++ fld.d $f11, Y, 0x00 ++ fldx.d $f12, Y, INC_Y ++ PTR_ALSL PY0, INC_Y, Y, 1 ++ fld.d $f13, PY0, 0x00 ++ fldx.d $f14, PY0, INC_Y ++ ++ GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14 ++ ++ PTR_SLLI K_LDA, LDA, 2 ++ PTR_SUB K_LDA, K_LDA, M8 ++ ++#if __loongarch_grlen == 64 ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#endif ++ fst.d $f11, Y, 0x00 ++ fstx.d $f12, Y, INC_Y ++ fst.d $f13, PY0, 0x00 ++ fstx.d $f14, PY0, INC_Y ++ PTR_ALSL Y, INC_Y, Y, 2 ++.L_\XW\()_N_3: ++ andi J, N, 2 ++ beqz J, .L_\XW\()_N_1 ++ ZERO_Y2 ++ move X, X_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_N_2_M_7 ++.align 5 ++.L_\XW\()_N_2_M_L8: ++ DLOAD_\X8 ++ DGEMV_T_2x8 ++ PTR_ADDI I, I, -1 ++ PTR_ALSL X, INC_X, X, 3 ++ bnez I, .L_\XW\()_N_2_M_L8 ++.L_\XW\()_N_2_M_7: ++ andi I, M, 4 ++ beqz I, .L_\XW\()_N_2_M_3 ++ DLOAD_\X4 ++ DGEMV_T_2x4 ++ PTR_ALSL X, INC_X, X, 2 ++.L_\XW\()_N_2_M_3: ++ // Accumulated ++ GACC xvf, d, Y0, TP0, Y1, TP1 ++ andi I, M, 3 ++ beqz I, .L_\XW\()_N_2_M_END 
++.align 5 ++.L_\XW\()_N_2_M_L1: ++ fld.d $f1, X, 0x00 ++ GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00 ++ GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4 ++ PTR_ADDI I, I, -1 ++ PTR_ADD X, X, INC_X ++ bnez I, .L_\XW\()_N_2_M_L1 ++.L_\XW\()_N_2_M_END: ++ fld.d $f11, Y, 0x00 ++ fldx.d $f12, Y, INC_Y ++ ++ GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12 ++ ++ PTR_SLLI K_LDA, LDA, 1 ++ PTR_SUB K_LDA, K_LDA, M8 ++ ++#if __loongarch_grlen == 64 ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA ++#endif ++ fst.d $f11, Y, 0x00 ++ fstx.d $f12, Y, INC_Y ++ PTR_ALSL Y, INC_Y, Y, 1 ++.L_\XW\()_N_1: ++ andi J, N, 1 ++ beqz J, .L_END ++ ZERO_Y1 ++ move X, X_ORG ++ move I, M ++ beqz I, .L_END ++.align 5 ++.L_\XW\()_N_1_M_L1: ++ fld.d $f3, PA0, 0x00 ++ fld.d $f1, X, 0x00 ++ fmadd.d $f19, $f3, $f1, $f19 ++ PTR_ADDI I, I, -1 ++ PTR_ADD X, X, INC_X ++ PTR_ADDI PA0, PA0, 0x08 ++ bnez I, .L_\XW\()_N_1_M_L1 ++ fld.d $f3, Y, 0x00 ++ fmadd.d $f3, ALPHA, $f19, $f3 ++ fst.d $f3, Y, 0x00 ++ b .L_END ++.endm ++ ++ PROLOGUE ++ PTR_LD INC_Y, $sp, 0 ++ push_if_used 17 + 8, 24 + 3 ++ PTR_ADDI K, $r0, 0x01 ++ PTR_SUB I, INC_X, K ++ maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ ++ GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 ++ xvreplve0.d VALPHA, $xr0 ++ move X_ORG, X ++ move PA0, A ++#if __loongarch_grlen == 64 ++ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#else ++ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#endif ++ la.local T0, .L_GAP_TABLE ++ PTR_ALSL I, I, T0, 1 ++ ld.h K, I, 0 ++ PTR_ADD T0, T0, K ++ jirl $r0, T0, 0 ++.L_GAP_TABLE: ++ .hword .L_GAP_0 - .L_GAP_TABLE ++ .hword .L_GAP_1 - .L_GAP_TABLE ++.L_GAP_0: /* if (incx == 1) */ ++ DGEMV_T_LASX GAP_0, X8, X4 ++.L_GAP_1: /* if (incx != 1) */ ++ DGEMV_T_LASX GAP_1, X8_GAP, X4_GAP ++.L_END: ++ pop_if_used 17 + 8, 24 + 3 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S +new file mode 100644 +index 000000000..3315daccb +--- /dev/null ++++ b/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S +@@ -0,0 +1,1366 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. 
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/07/26 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++*********************************************************************/ ++ ++/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, ++ * FLOAT *c, BLASLONG ldc, BLASLONG offset) ++ */ ++#define M $r4 // param 1: bm ++#define N $r5 // param 2: bn ++#define K $r6 // param 3: bk ++#define A $r7 // param 5: ba ++#define B $r8 // param 6: bb ++#define C $r9 // param 7: bc ++#define LDC $r10 // param 8: ldc ++#define OFFSET $r11 // param 9: offset ++ ++/* Cycle control parameters */ ++#define I $r13 ++#define J $r14 ++#define L $r15 ++#define TL $r16 ++/* Matrix address */ ++#define A0 $r17 ++#define B0 $r18 ++#define C0 $r19 ++#define C1 $r20 ++#define C2 $r23 ++#define C3 $r24 ++#define T0 $r25 ++#define T1 $r26 ++#define T2 $r27 ++#define KK $r28 ++#define AA $r29 ++#define CC $r30 ++#undef ZERO ++#define ZERO $r0 ++ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++#define U8 $xr8 ++#define U9 $xr9 ++#define U10 $xr10 ++#define U11 $xr11 ++#define U12 $xr12 ++#define U13 $xr13 ++#define U14 $xr14 ++#define U15 $xr15 ++#define D0 $xr16 ++#define D1 $xr17 ++#define D2 $xr18 ++#define D3 $xr19 ++#define D4 $xr20 ++#define D5 $xr21 ++#define D6 $xr22 ++#define D7 $xr23 ++#define D8 $xr24 ++#define D9 $xr25 ++#define D10 $xr26 ++#define D11 $xr27 ++#define D12 $xr28 ++#define D13 $xr29 ++#define D14 $xr30 ++#define D15 $xr31 ++ ++/* Prefetch interval */ ++#define A_PRE 0x400 ++#define B_PRE 0x100 ++ ++#include "dtrsm_kernel_macro.S" ++ ++// By integrating the dgemm and dsolve processes, the following advantages can be obtained: ++// 1. Avoid the overhead of function calls (by not invoking dgemm_kernel) ++// 2. Reduce the storage and retrieval of C data ++// 3. Vectorization of dsolve ++// GEMM_UNROLL_M x DGEMM_UNROLL_N is 16x4, which is a fairly large size. ++// To achieve finer-grained optimization, 15 scenarios have been addressed: ++// 16x4, 16x2, 16x1, 8x4, 8x2, 8x1, 4x4, 4x2, 4x1, 2x4, 2x2, 2x1, 1x4, 1x2, 1x1. ++ ++.macro dsolve_16 N ++// if N = 4 the data layout of C is as follows: ++// U0 U1 U2 U3 ++// U4 U5 U6 U7 ++// U8 U9 U10 U11 ++// U12 U13 U14 U15 ++// if N = 2 the dat layout of C is as follows: ++// U0 U1 U2 U3 ++// U4 U5 U6 U7 ++// if N = 1 the dat layout of C is as follows: ++// U0 U1 U2 U3 ++// The matrix A has dimensions of 16x16, and ++// it will be divided into 4 segments for processing. 
++ ++#define G12 U3 ++#define G13 U7 ++#define G14 U11 ++#define G15 U15 ++ GTRANSPOSE4x4_D U3, U7, U11, U15, G12, G13, G14, G15, D0, D1 ++ // A ++ // G12 G13 G14 G15 ++ // ----------------- ++ // 204 | D9 ++ // 220 221 | D8 D7 ++ // 236 237 238 | D6 D5 D4 ++ // 252 253 254 255 | D3 D2 D1 D0 ++ PTR_ADDI T0, A0, 252 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 236 * 8 ++ GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 ++ PTR_ADDI T0, A0, 220 * 8 ++ GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 ++ PTR_ADDI T0, A0, 204 * 8 ++ GLDREPL xv, d, D9, T0, 0 ++ ++ xvfmul.d G15, G15, D0 ++ GNMSUB xvf, d, G14, G15, D1, G14 ++ xvfmul.d G14, G14, D4 ++ GNMSUB xvf, d, G13, G15, D2, G13, G13, G14, D5, G13 ++ xvfmul.d G13, G13, D7 ++ GNMSUB xvf, d, G12, G15, D3, G12, G12, G14, D6, G12, G12, G13, D8, G12 ++ xvfmul.d G12, G12, D9 ++ // Store B ++.if \N == 4 ++ // x x x x ... x x x x ++ // x x x x ... x x x x ++ // x x x x ... x x x x ++ // b48 b49 b50 b51 ... b60 b61 b62 b63 ++ GST xv, , G12, B0, 48 * 8, G13, B0, 52 * 8, G14, B0, 56 * 8, G15, B0, 60 * 8 ++.elseif \N == 2 ++ // x x x x ... x x x x ++ // x x x x ... x x x x ++ // x x x x ... x x x x ++ // b24 b25 b26 b27 b28 b29 b30 b31 ++ GST v, , $vr3, B0, 24 * 8, $vr7, B0, 26 * 8, $vr11, B0, 28 * 8, $vr15, B0, 30 * 8 ++.elseif \N == 1 ++ // x x x x ++ // x x x x ++ // x x x x ++ // b12 b13 b14 b15 ++ GST f, d, $f3, B0, 12 * 8, $f7, B0, 13 * 8, $f11, B0, 14 * 8, $f15, B0, 15 * 8 ++.endif ++ // Transpose G15 G14 G13 G12 ++ GTRANSPOSE4x4_D G12, G13, G14, G15, D0, D1, D2, D3, D4, D5 ++ // Store C ++.if \N == 4 ++ // x x x x ... c12 c13 c14 c15 ++ // x x x x ... c28 c29 c30 c31 ++ // x x x x ... c44 c45 c46 c47 ++ // x x x x ... c60 c61 c62 c63 ++ GST xv, , D0, C0, 12 * 8, D1, C1, 12 * 8, D2, C2, 12 * 8, D3, C3, 12 * 8 ++.elseif \N == 2 ++ // x x x x ... c12 c13 c14 c15 ++ // x x x x ... c28 c29 c30 c31 ++ GST xv, , D0, C0, 12 * 8, D1, C1, 12 * 8 ++.elseif \N == 1 ++ // Store C ++ // x x x x ... 
c12 c13 c14 c15 ++ GST xv, , D0, C0, 12 * 8 ++.endif ++ ++#define G8 U2 ++#define G9 U6 ++#define G10 U10 ++#define G11 U14 ++ GTRANSPOSE4x4_D U2, U6, U10, U14, G8, G9, G10, G11, D0, D1 ++ // A ++ // G8 G9 G10 G11 ++ // ----------------- ++ // 136 | D9 ++ // 152 153 | D8 D7 ++ // 168 169 170 | D6 D5 D4 ++ // 184 185 186 187 | D3 D2 D1 D0 ++ // 200 201 202 203 | D15 D14 D13 D12 ++ // 216 217 218 219 | D11 D10 D9 D8 ++ // 232 233 234 235 | D7 D6 D5 D4 ++ // 248 249 250 251 | D3 D2 D1 D0 ++ PTR_ADDI T0, A0, 248 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 232 * 8 ++ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 ++ PTR_ADDI T0, A0, 216 * 8 ++ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 ++ PTR_ADDI T0, A0, 200 * 8 ++ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 ++ GNMSUB xvf, d, G11, G15, D0, G11, G10, G15, D1, G10, G9, G15, D2, G9, G8, G15, D3, G8, \ ++ G11, G14, D4, G11, G10, G14, D5, G10, G9, G14, D6, G9, G8, G14, D7, G8, \ ++ G11, G13, D8, G11, G10, G13, D9, G10, G9, G13, D10, G9, G8, G13, D11, G8, \ ++ G11, G12, D12, G11, G10, G12, D13, G10, G9, G12, D14, G9, G8, G12, D15, G8 ++ PTR_ADDI T0, A0, 184 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 168 * 8 ++ GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 ++ PTR_ADDI T0, A0, 152 * 8 ++ GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 ++ PTR_ADDI T0, A0, 136 * 8 ++ GLDREPL xv, d, D9, T0, 0 ++ ++ xvfmul.d G11, G11, D0 ++ GNMSUB xvf, d, G10, G11, D1, G10, G9, G11, D2, G9, G8, G11, D3, G8 ++ xvfmul.d G10, G10, D4 ++ GNMSUB xvf, d, G9, G10, D5, G9, G8, G10, D6, G8 ++ xvfmul.d G9, G9, D7 ++ GNMSUB xvf, d, G8, G9, D8, G8 ++ xvfmul.d G8, G8, D9 ++ // Store B ++.if \N == 4 ++ // x x x x ... x x x x ++ // x x x x ... x x x x ++ // b32 b33 b34 b34 ... b44 b45 b46 b47 ++ // b48 b49 b50 b51 ... b60 b61 b62 b63 ++ GST xv, , G8, B0, 32 * 8, G9, B0, 36 * 8, G10, B0, 40 * 8, G11, B0, 44 * 8 ++.elseif \N == 2 ++ // x x x x ... x x x x ++ // x x x x ... x x x x ++ // b16 b17 b18 b19 b20 b21 b22 b23 ++ // b24 b25 b26 b27 b28 b29 b30 b31 ++ GST v, , $vr2, B0, 16 * 8, $vr6, B0, 18 * 8, $vr10, B0, 20 * 8, $vr14, B0, 22 * 8 ++.elseif \N == 1 ++ // x x x x ++ // x x x x ++ // b8 b9 b10 b11 ++ // b12 b13 b14 b15 ++ GST f, d, $f2, B0, 8 * 8, $f6, B0, 9 * 8, $f10, B0, 10 * 8, $f14, B0, 11 * 8 ++.endif ++ // Transpose G11 G10 G9 G8 ++ GTRANSPOSE4x4_D G8, G9, G10, G11, D0, D1, D2, D3, D4, D5 ++ // Store C ++.if \N == 4 ++ // x x x x ... c8 c9 c10 c11 c12 c13 c14 c15 ++ // x x x x ... c24 c25 c26 c27 c28 c29 c30 c31 ++ // x x x x ... c40 c41 c42 c43 c44 c45 c46 c47 ++ // x x x x ... c56 c57 c58 c59 c60 c61 c62 c63 ++ GST xv, , D0, C0, 8 * 8, D1, C1, 8 * 8, D2, C2, 8 * 8, D3, C3, 8 * 8 ++.elseif \N == 2 ++ // x x x x ... c8 c9 c10 c11 c12 c13 c14 c15 ++ // x x x x ... c24 c25 c26 c27 c28 c29 c30 c31 ++ GST xv, , D0, C0, 8 * 8, D1, C1, 8 * 8 ++.elseif \N == 1 ++ // x x x x ... 
c8 c9 c10 c11 c12 c13 c14 c15 ++ GST xv, , D0, C0, 8 * 8 ++.endif ++ ++#define G4 U1 ++#define G5 U5 ++#define G6 U9 ++#define G7 U13 ++ GTRANSPOSE4x4_D U1, U5, U9, U13, G4, G5, G6, G7, D0, D1 ++ // A ++ // G4 G5 G6 G7 ++ // ------------------ ++ // 68 | D9 ++ // 84 85 | D8 D7 ++ // 100 101 102 | D6 D5 D4 ++ // 116 117 118 119 | D3 D2 D1 D0 ++ // 132 133 134 135 | D15 D14 D13 D12 ++ // 148 149 150 151 | D11 D10 D9 D8 ++ // 164 165 166 167 | D7 D6 D5 D4 ++ // 180 181 182 183 | D3 D2 D1 D0 ++ // 196 197 198 199 | D15 D14 D13 D12 ++ // 212 213 214 215 | D11 D10 D9 D8 ++ // 228 229 230 231 | D7 D6 D5 D4 ++ // 244 245 246 247 | D3 D2 D1 D0 ++ PTR_ADDI T0, A0, 244 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 228 * 8 ++ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 ++ PTR_ADDI T0, A0, 212 * 8 ++ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 ++ PTR_ADDI T0, A0, 196 * 8 ++ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 ++ GNMSUB xvf, d, G7, G15, D0, G7, G6, G15, D1, G6, G5, G15, D2, G5, G4, G15, D3, G4, \ ++ G7, G14, D4, G7, G6, G14, D5, G6, G5, G14, D6, G5, G4, G14, D7, G4, \ ++ G7, G13, D8, G7, G6, G13, D9, G6, G5, G13, D10, G5, G4, G13, D11, G4, \ ++ G7, G12, D12, G7, G6, G12, D13, G6, G5, G12, D14, G5, G4, G12, D15, G4 ++ PTR_ADDI T0, A0, 180 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 164 * 8 ++ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 ++ PTR_ADDI T0, A0, 148 * 8 ++ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 ++ PTR_ADDI T0, A0, 132 * 8 ++ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 ++ GNMSUB xvf, d, G7, G11, D0, G7, G6, G11, D1, G6, G5, G11, D2, G5, G4, G11, D3, G4, \ ++ G7, G10, D4, G7, G6, G10, D5, G6, G5, G10, D6, G5, G4, G10, D7, G4, \ ++ G7, G9, D8, G7, G6, G9, D9, G6, G5, G9, D10, G5, G4, G9, D11, G4, \ ++ G7, G8, D12, G7, G6, G8, D13, G6, G5, G8, D14, G5, G4, G8, D15, G4 ++ PTR_ADDI T0, A0, 116 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 100 * 8 ++ GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 ++ PTR_ADDI T0, A0, 84 * 8 ++ GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 ++ PTR_ADDI T0, A0, 68 * 8 ++ GLDREPL xv, d, D9, T0, 0 ++ xvfmul.d G7, G7, D0 ++ GNMSUB xvf, d, G6, G7, D1, G6, G5, G7, D2, G5, G4, G7, D3, G4 ++ xvfmul.d G6, G6, D4 ++ GNMSUB xvf, d, G5, G6, D5, G5, G4, G6, D6, G4 ++ xvfmul.d G5, G5, D7 ++ GNMSUB xvf, d, G4, G5, D8, G4 ++ xvfmul.d G4, G4, D9 ++ // Store B ++.if \N == 4 ++ // x x x x ... x x x x ++ // b16 b17 b18 b19 ... b28 b29 b30 b31 ++ // b32 b33 b34 b34 ... b44 b45 b46 b47 ++ // b48 b49 b50 b51 ... b60 b61 b62 b63 ++ GST xv, , G4, B0, 16 * 8, G5, B0, 20 * 8, G6, B0, 24 * 8, G7, B0, 28 * 8 ++.elseif \N == 2 ++ // x x x x ... 
x x x x ++ // b8 b9 b10 b11 b12 b13 b14 b15 ++ // b16 b17 b18 b19 b20 b21 b22 b23 ++ // b24 b25 b26 b27 b28 b29 b30 b31 ++ GST v, , $vr1, B0, 8 * 8, $vr5, B0, 10 * 8, $vr9, B0, 12 * 8, $vr13, B0, 14 * 8 ++.elseif \N == 1 ++ // x x x x ++ // b4 b5 b6 b7 ++ // b8 b9 b10 b11 ++ // b12 b13 b14 b15 ++ GST f, d, $f1, B0, 4 * 8, $f5, B0, 5 * 8, $f9, B0, 6 * 8, $f13, B0, 7 * 8 ++.endif ++ // Transpose G7 G6 G5 G4 ++ GTRANSPOSE4x4_D G4, G5, G6, G7, D0, D1, D2, D3, D4, D5 ++ // Store C ++.if \N == 4 ++ // x x x x c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 ++ // x x x x c20 c21 c22 c23 c24 c25 c26 c27 c28 c29 c30 c31 ++ // x x x x c36 c37 c38 c39 c40 c41 c42 c43 c44 c45 c46 c47 ++ // x x x x c52 c53 c54 c55 c56 c57 c58 c59 c60 c61 c62 c63 ++ GST xv, , D0, C0, 4 * 8, D1, C1, 4 * 8, D2, C2, 4 * 8, D3, C3, 4 * 8 ++.elseif \N == 2 ++ // x x x x c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 ++ // x x x x c20 c21 c22 c23 c24 c25 c26 c27 c28 c29 c30 c31 ++ GST xv, , D0, C0, 4 * 8, D1, C1, 4 * 8 ++.elseif \N == 1 ++ // x x x x c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 ++ GST xv, , D0, C0, 4 * 8 ++.endif ++ ++#define G0 U0 ++#define G1 U4 ++#define G2 U8 ++#define G3 U12 ++ GTRANSPOSE4x4_D U0, U4, U8, U12, G0, G1, G2, G3, D0, D1 ++ // A ++ // G0 G1 G2 G3 ++ // ------------------ ++ // 0 | D9 ++ // 16 17 | D8 D7 ++ // 32 33 34 | D6 D5 D4 ++ // 48 49 50 51 | D3 D2 D1 D0 ++ // 64 65 66 67 | D15 D14 D13 D12 ++ // 80 81 82 83 | D11 D10 D9 D8 ++ // 96 97 98 99 | D7 D6 D5 D4 ++ // 112 113 114 115 | D3 D2 D1 D0 ++ // 128 129 130 131 | D15 D14 D13 D12 ++ // 144 145 146 147 | D11 D10 D9 D8 ++ // 160 161 162 163 | D7 D6 D5 D4 ++ // 176 177 178 179 | D3 D2 D1 D0 ++ // 192 193 194 195 | D15 D14 D13 D12 ++ // 208 209 210 211 | D11 D10 D9 D8 ++ // 224 225 226 227 | D7 D6 D5 D4 ++ // 240 241 242 243 | D3 D2 D1 D0 ++ PTR_ADDI T0, A0, 240 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 224 * 8 ++ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 ++ PTR_ADDI T0, A0, 208 * 8 ++ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 ++ PTR_ADDI T0, A0, 192 * 8 ++ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 ++ GNMSUB xvf, d, G3, G15, D0, G3, G2, G15, D1, G2, G1, G15, D2, G1, G0, G15, D3, G0, \ ++ G3, G14, D4, G3, G2, G14, D5, G2, G1, G14, D6, G1, G0, G14, D7, G0, \ ++ G3, G13, D8, G3, G2, G13, D9, G2, G1, G13, D10, G1, G0, G13, D11, G0, \ ++ G3, G12, D12, G3, G2, G12, D13, G2, G1, G12, D14, G1, G0, G12, D15, G0 ++ PTR_ADDI T0, A0, 176 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 160 * 8 ++ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 ++ PTR_ADDI T0, A0, 144 * 8 ++ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 ++ PTR_ADDI T0, A0, 128 * 8 ++ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 ++ GNMSUB xvf, d, G3, G11, D0, G3, G2, G11, D1, G2, G1, G11, D2, G1, G0, G11, D3, G0, \ ++ G3, G10, D4, G3, G2, G10, D5, G2, G1, G10, D6, G1, G0, G10, D7, G0, \ ++ G3, G9, D8, G3, G2, G9, D9, G2, G1, G9, D10, G1, G0, G9, D11, G0, \ ++ G3, G8, D12, G3, G2, G8, D13, G2, G1, G8, D14, G1, G0, G8, D15, G0 ++ PTR_ADDI T0, A0, 112 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 96 * 8 ++ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 ++ PTR_ADDI T0, A0, 80 * 8 ++ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 ++ PTR_ADDI T0, 
A0, 64 * 8 ++ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 ++ GNMSUB xvf, d, G3, G7, D0, G3, G2, G7, D1, G2, G1, G7, D2, G1, G0, G7, D3, G0, \ ++ G3, G6, D4, G3, G2, G6, D5, G2, G1, G6, D6, G1, G0, G6, D7, G0, \ ++ G3, G5, D8, G3, G2, G5, D9, G2, G1, G5, D10, G1, G0, G5, D11, G0, \ ++ G3, G4, D12, G3, G2, G4, D13, G2, G1, G4, D14, G1, G0, G4, D15, G0 ++ PTR_ADDI T0, A0, 48 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 32 * 8 ++ GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 ++ PTR_ADDI T0, A0, 16 * 8 ++ GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 ++ PTR_ADDI T0, A0, 0 * 8 ++ GLDREPL xv, d, D9, T0, 0 ++ ++ xvfmul.d G3, G3, D0 ++ GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0 ++ xvfmul.d G2, G2, D4 ++ GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0 ++ xvfmul.d G1, G1, D7 ++ GNMSUB xvf, d, G0, G1, D8, G0 ++ xvfmul.d G0, G0, D9 ++ // Store B ++.if \N == 4 ++ // b0 b1 b2 b3 ... b12 b13 b14 b15 ++ // b16 b17 b18 b19 ... b28 b29 b30 b31 ++ // b32 b33 b34 b34 ... b44 b45 b46 b47 ++ // b48 b49 b50 b51 ... b60 b61 b62 b63 ++ GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8 ++.elseif \N == 2 ++ // b0 b1 b2 b3 b4 b5 b6 b7 ++ // b8 b9 b10 b11 b12 b13 b14 b15 ++ // b16 b17 b18 b19 b20 b21 b22 b23 ++ // b24 b25 b26 b27 b28 b29 b30 b31 ++ GST v, , $vr0, B0, 0, $vr4, B0, 2 * 8, $vr8, B0, 4 * 8, $vr12, B0, 6 * 8 ++.elseif \N == 1 ++ // b0 b1 b2 b3 ++ // b4 b5 b6 b7 ++ // b8 b9 b10 b11 ++ // b12 b13 b14 b15 ++ GST f, d, $f0, B0, 0, $f4, B0, 1 * 8, $f8, B0, 2 * 8, $f12, B0, 3 * 8 ++.endif ++ // Transpose C3 C2 C1 C0 ++ GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5 ++ // Store C ++.if \N == 4 ++ // c0 c1 c2 c3 ... c12 c13 c14 c15 ++ // c16 c17 c18 c19 ... c28 c29 c30 c31 ++ // c32 c33 c34 c34 ... c44 c45 c46 c47 ++ // c48 c49 c50 c51 ... c60 c61 c62 c63 ++ GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0 ++.elseif \N == 2 ++ // c0 c1 c2 c3 ... c12 c13 c14 c15 ++ // c16 c17 c18 c19 ... c28 c29 c30 c31 ++ GST xv, , D0, C0, 0, D1, C1, 0 ++.elseif \N == 1 ++ // c0 c1 c2 c3 ... c12 c13 c14 c15 ++ GST xv, , D0, C0, 0 ++.endif ++ ++#undef G0 ++#undef G1 ++#undef G2 ++#undef G3 ++#undef G4 ++#undef G5 ++#undef G6 ++#undef G7 ++#undef G8 ++#undef G9 ++#undef G10 ++#undef G11 ++#undef G12 ++#undef G13 ++#undef G14 ++#undef G15 ++.endm ++ ++.macro dsolve_8 N ++// if N = 4 the data layout of C is as follows: ++// U0 U1 ++// U2 U3 ++// U4 U5 ++// U6 U7 ++// if N = 2 the dat layout of C is as follows: ++// U0 U1 ++// U2 U3 ++// if N = 1 the dat layout of C is as follows: ++// U0 U1 ++// The matrix A has dimensions of 8x8, and ++// it will be divided into 2 segments for processing. 
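++// Note: as in dsolve_16 above, each elimination step follows the usual
++// substitution pattern of the trsm kernels. The diagonal entries loaded from
++// the packed A panel are assumed to be stored pre-inverted (as the OpenBLAS
++// trsm pack routines conventionally provide), so a multiply replaces the
++// divide. In rough scalar form, with hypothetical names x[] for a solution
++// row and inv_d for the pre-inverted diagonal entry:
++//   x[i]  = x[i] * inv_d;              // xvfmul.d  Gi, Gi, Dk
++//   x[j] -= a[j][i] * x[i];            // GNMSUB for every remaining row j
++// The vector code below applies these two operations to four result columns
++// at a time.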
++ ++#define G4 U1 ++#define G5 U3 ++#define G6 U5 ++#define G7 U7 ++ // Transpose U7 U5 U3 U1 ++ GTRANSPOSE4x4_D U1, U3, U5, U7, G4, G5, G6, G7, D0, D1 ++ // A ++ // G4 G5 G6 G7 ++ // --------------- ++ // 36 | D9 ++ // 44 45 | D8 D7 ++ // 52 53 54 | D6 D5 D4 ++ // 60 61 62 63 | D3 D2 D1 D0 ++ PTR_ADDI T0, A0, 60 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 52 * 8 ++ GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 ++ PTR_ADDI T0, A0, 44 * 8 ++ GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 ++ PTR_ADDI T0, A0, 36 * 8 ++ GLDREPL xv, d, D9, T0, 0 ++ ++ xvfmul.d G7, G7, D0 ++ GNMSUB xvf, d, G6, G7, D1, G6, G5, G7, D2, G5, G4, G7, D3, G4 ++ xvfmul.d G6, G6, D4 ++ GNMSUB xvf, d, G5, G6, D5, G5, G4, G6, D6, G4 ++ xvfmul.d G5, G5, D7 ++ GNMSUB xvf, d, G4, G5, D8, G4 ++ xvfmul.d G4, G4, D9 ++ // Store B ++.if \N == 4 ++ GST xv, , G4, B0, 16 * 8, G5, B0, 20 * 8, G6, B0, 24 * 8, G7, B0, 28 * 8 ++.elseif \N == 2 ++ GST v, , $vr1, B0, 8 * 8, $vr3, B0, 10 * 8, $vr5, B0, 12 * 8, $vr7, B0, 14 * 8 ++.elseif \N == 1 ++ GST f, d, $f1, B0, 4 * 8, $f3, B0, 5 * 8, $f5, B0, 6 * 8, $f7, B0, 7 * 8 ++.endif ++ // Transpose ++ GTRANSPOSE4x4_D G4, G5, G6, G7, D4, D5, D6, D7, D8, D9 ++ // Store C ++.if \N == 4 ++ GST xv, , D4, C0, 4 * 8, D5, C1, 4 * 8, D6, C2, 4 * 8, D7, C3, 4 * 8 ++.elseif \N == 2 ++ GST xv, , D4, C0, 4 * 8, D5, C1, 4 * 8 ++.elseif \N == 1 ++ GST xv, , D4, C0, 4 * 8 ++.endif ++ ++#define G0 U0 ++#define G1 U2 ++#define G2 U4 ++#define G3 U6 ++ // Transpose U6 U4 U2 U0 ++ GTRANSPOSE4x4_D U0, U2, U4, U6, G0, G1, G2, G3, D0, D1 ++ // A ++ // G0 G1 G2 G3 ++ //----------------- ++ // 0 | D9 ++ // 8 9 | D8 D7 ++ // 16 17 18 | D6 D5 D4 ++ // 24 25 26 27 | D3 D2 D1 D0 ++ // 32 33 34 35 | D15 D14 D13 D12 ++ // 40 41 42 43 | D11 D10 D9 D8 ++ // 48 49 50 51 | D7 D6 D5 D4 ++ // 56 57 58 59 | D3 D2 D1 D0 ++ PTR_ADDI T0, A0, 56 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 48 * 8 ++ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 ++ PTR_ADDI T0, A0, 40 * 8 ++ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 ++ PTR_ADDI T0, A0, 32 * 8 ++ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 ++ GNMSUB xvf, d, G3, G7, D0, G3, G2, G7, D1, G2, G1, G7, D2, G1, G0, G7, D3, G0, \ ++ G3, G6, D4, G3, G2, G6, D5, G2, G1, G6, D6, G1, G0, G6, D7, G0, \ ++ G3, G5, D8, G3, G2, G5, D9, G2, G1, G5, D10, G1, G0, G5, D11, G0, \ ++ G3, G4, D12, G3, G2, G4, D13, G2, G1, G4, D14, G1, G0, G4, D15, G0 ++ PTR_ADDI T0, A0, 24 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 16 * 8 ++ GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 ++ PTR_ADDI T0, A0, 8 * 8 ++ GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 ++ PTR_ADDI T0, A0, 0 * 8 ++ GLDREPL xv, d, D9, T0, 0 ++ ++ xvfmul.d G3, G3, D0 ++ GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0 ++ xvfmul.d G2, G2, D4 ++ GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0 ++ xvfmul.d G1, G1, D7 ++ GNMSUB xvf, d, G0, G1, D8, G0 ++ xvfmul.d G0, G0, D9 ++ // Store B ++.if \N == 4 ++ GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8 ++.elseif \N == 2 ++ GST v, , $vr0, B0, 0, $vr2, B0, 2 * 8, $vr4, B0, 4 * 8, $vr6, B0, 6 * 8 ++.elseif \N == 1 ++ GST f, d, $f0, B0, 0, $f2, B0, 1 * 8, $f4, B0, 2 * 8, $f6, B0, 3 * 8 ++.endif ++ // Transpose ++ GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5 ++ // Store C ++.if \N == 4 ++ GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, 
D3, C3, 0 ++.elseif \N == 2 ++ GST xv, , D0, C0, 0, D1, C1, 0 ++.elseif \N == 1 ++ GST xv, , D0, C0, 0 ++.endif ++ ++#undef G0 ++#undef G1 ++#undef G2 ++#undef G3 ++#undef G4 ++#undef G5 ++#undef G6 ++#undef G7 ++.endm ++ ++.macro dsolve_4 N ++// if N = 4 the data layout of C is as follows: ++// U0 ++// U1 ++// U2 ++// U3 ++// if N = 2 the dat layout of C is as follows: ++// U0 ++// U1 ++// if N = 1 the dat layout of C is as follows: ++// U0 ++// The matrix A has dimensions of 4x4, and ++// it will be divided into 1 segments for processing. ++ ++#define G0 U0 ++#define G1 U1 ++#define G2 U2 ++#define G3 U3 ++ // Transpose U3 U2 U1 U0 ++ GTRANSPOSE4x4_D U0, U1, U2, U3, G0, G1, G2, G3, D0, D1 ++ // A ++ // G0 G1 G2 G3 ++ //------------- ++ // 0 | D9 ++ // 4 5 | D8 D7 ++ // 8 9 10 | D6 D5 D4 ++ // 12 13 14 15 | D3 D2 D1 D0 ++ GLDREPL xv, d, D3, A0, 12 * 8, D2, A0, 13 * 8, D1, A0, 14 * 8, D0, A0, 15 * 8, \ ++ D6, A0, 8 * 8, D5, A0, 9 * 8, D4, A0, 10 * 8, \ ++ D8, A0, 4 * 8, D7, A0, 5 * 8, \ ++ D9, A0, 0 * 8 ++ xvfmul.d G3, G3, D0 ++ GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0 ++ xvfmul.d G2, G2, D4 ++ GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0 ++ xvfmul.d G1, G1, D7 ++ GNMSUB xvf, d, G0, G1, D8, G0 ++ xvfmul.d G0, G0, D9 ++ // Store B ++.if \N == 4 ++ GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8 ++.elseif \N == 2 ++ GST v, , $vr0, B0, 0, $vr1, B0, 2 * 8, $vr2, B0, 4 * 8, $vr3, B0, 6 * 8 ++.elseif \N == 1 ++ GST f, d, $f0, B0, 0, $f1, B0, 1 * 8, $f2, B0, 2 * 8, $f3, B0, 3 * 8 ++.endif ++ // Transpose ++ GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5 ++ // Store C ++.if \N == 4 ++ GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0 ++.elseif \N == 2 ++ GST xv, , D0, C0, 0, D1, C1, 0 ++.elseif \N == 1 ++ GST xv, , D0, C0, 0 ++.endif ++ ++#undef G0 ++#undef G1 ++#undef G2 ++#undef G3 ++.endm ++ ++.macro dsolve_2 N ++#define G0 U2 ++#define G1 U3 ++ // Transpose ++ GSBUTTERFLY xv, d, G0, G1, U1, U0 ++ // A ++ // G0 G1 ++ // ------ ++ // 0 | D2 ++ // 2 3 | D1 D0 ++ GLDREPL xv, d, D2, A0, 0, D1, A0, 2 * 8, D0, A0, 3 * 8 ++ xvfmul.d G1, G1, D0 ++ GNMSUB xvf, d, G0, G1, D1, G0 ++ xvfmul.d G0, G0, D2 ++ // Store B ++.if \N == 4 ++ GST xv, , G0, B0, 0, G1, B0, 4 * 8 ++.elseif \N == 2 ++ GST v, , $vr2, B0, 0, $vr3, B0, 2 * 8 ++.elseif \N == 1 ++ GST f, d, $f2, B0, 0, $f3, B0, 8 ++.endif ++ // Transpose ++ GSBUTTERFLY xv, d, D0, D1, G1, G0 ++ // Store C ++.if \N == 4 ++ vst $vr16, C0, 0x00 ++ vst $vr17, C1, 0x00 ++ xvstelm.d D0, C2, 0x00, 0x02 ++ xvstelm.d D1, C3, 0x00, 0x02 ++ xvstelm.d D0, C2, 0x08, 0x03 ++ xvstelm.d D1, C3, 0x08, 0x03 ++.elseif \N == 2 ++ GST v, , $vr16, C0, 0, $vr17, C1, 0 ++.elseif \N == 1 ++ GST v, , $vr16, C0, 0 ++.endif ++ ++#undef G0 ++#undef G1 ++.endm ++ ++.macro dgemm_dsolve_16x4 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_16x4_load ++ dgemm_16x4 ++ b .L_dsolve_16x4 ++.L_dsolve_16x4_load: ++ // Load C ++ GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 ++ GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 ++ GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 ++ GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 ++/********************** solver ******************/ ++.L_dsolve_16x4: ++ PTR_ADDI A0, T1, -(16 * 8 * 8) ++ PTR_ADDI A0, A0, -(16 * 8 * 8) ++ PTR_ADDI B0, T2, -(16 * 4 * 8) ++ dsolve_16 4 ++.endm ++ ++.macro dgemm_dsolve_1x4 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_1x4_load ++ dgemm_1x4 ++ b .L_dsolve_1x4 
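++// Note: every dgemm_dsolve_MxN macro in this file follows the same shape:
++// A0/B0 are first saved in T1/T2; if L (= K - KK) is not positive there is
++// no pending GEMM update, so the C tile is simply loaded, otherwise the
++// dgemm_MxN macro applies the trailing rank-L update first. The solve then
++// re-addresses the MxM triangular block of A and the matching rows of B
++// backwards from the saved pointers.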
++.L_dsolve_1x4_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++ fld.d $f1, C1, 0x00 ++ fld.d $f2, C2, 0x00 ++ fld.d $f3, C3, 0x00 ++ xvinsve0.d U0, U1, 0x01 ++ xvinsve0.d U0, U2, 0x02 ++ xvinsve0.d U0, U3, 0x03 ++.L_dsolve_1x4: ++ or A0, T1, T1 ++ or B0, T2, T2 ++ GLDREPL xv, d, D0, A0, -1 * 8 ++ GMUL xvf, d, U0, U0, D0 ++ // Store C ++ xvstelm.d U0, C0, 0x00, 0x00 ++ xvstelm.d U0, C1, 0x00, 0x01 ++ xvstelm.d U0, C2, 0x00, 0x02 ++ xvstelm.d U0, C3, 0x00, 0x03 ++ // Store B ++ xvst U0, B0, -32 ++.endm ++ ++.macro dgemm_dsolve_2x4 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_2x4_load ++ dgemm_2x4 ++ b .L_dsolve_2x4 ++.L_dsolve_2x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++ ++ xvpermi.q U0, U2, 0x02 ++ xvpermi.q U1, U3, 0x02 ++/********************** solver ******************/ ++.L_dsolve_2x4: ++ PTR_ADDI A0, T1, -(2 * 2 * 8) ++ PTR_ADDI B0, T2, -(2 * 4 * 8) ++ dsolve_2 4 ++.endm ++ ++.macro dgemm_dsolve_4x4 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_4x4_load ++ dgemm_4x4 ++ b .L_dsolve_4x4 ++.L_dsolve_4x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++/************** solver *****************/ ++.L_dsolve_4x4: ++ PTR_ADDI A0, T1, -(4 * 4 * 8) ++ PTR_ADDI B0, T2, -(4 * 4 * 8) ++ ++ dsolve_4 4 ++.endm ++ ++.macro dgemm_dsolve_8x4 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_8x4_load ++ dgemm_8x4 ++ b .L_dsolve_8x4 ++.L_dsolve_8x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++ ++ /* Load C2 */ ++ xvld U4, C2, 0x00 ++ xvld U5, C2, 0x20 ++ ++ /* Load C3 */ ++ xvld U6, C3, 0x00 ++ xvld U7, C3, 0x20 ++/********* solver *********/ ++.L_dsolve_8x4: ++ PTR_ADDI A0, T1, -(8 * 8 * 8) ++ PTR_ADDI B0, T2, -(8 * 4 * 8) ++ ++ dsolve_8 4 ++.endm ++ ++.macro dgemm_dsolve_4x2 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_4x2_load ++ dgemm_4x2 ++ b .L_dsolve_4x2 ++.L_dsolve_4x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++.L_dsolve_4x2: ++ PTR_ADDI A0, T1, -(4 * 4 * 8) ++ PTR_ADDI B0, T2, -(4 * 2 * 8) ++ ++ dsolve_4 2 ++.endm ++ ++.macro dgemm_dsolve_2x2 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_2x2_load ++ dgemm_2x2 ++ b .L_dsolve_2x2 ++.L_dsolve_2x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++.L_dsolve_2x2: ++ PTR_ADDI A0, T1, -(2 * 2 * 8) ++ PTR_ADDI B0, T2, -(2 * 2 * 8) ++ ++ dsolve_2 2 ++.endm ++ ++.macro dgemm_dsolve_8x2 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_8x2_load ++ dgemm_8x2 ++ b .L_dsolve_8x2 ++.L_dsolve_8x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++.L_dsolve_8x2: ++ PTR_ADDI A0, T1, -(8 * 8 * 8) ++ PTR_ADDI B0, T2, -(8 * 2 * 8) ++ ++ dsolve_8 2 ++.endm ++ ++.macro dgemm_dsolve_16x2 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_16x2_load ++ dgemm_16x2 ++ b .L_dsolve_16x2 ++.L_dsolve_16x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++ /* Load C1 */ ++ xvld U4, C1, 0x00 ++ xvld U5, C1, 0x20 ++ xvld U6, C1, 0x40 ++ xvld U7, C1, 0x60 ++.L_dsolve_16x2: ++ PTR_ADDI A0, T1, -(16 * 8 * 8) ++ PTR_ADDI A0, A0, -(16 * 8 * 8) ++ PTR_ADDI B0, T2, -(16 * 2 * 8) ++ ++ 
dsolve_16 2 ++.endm ++ ++.macro dgemm_dsolve_2x1 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_2x1_load ++ dgemm_2x1 ++ b .L_dsolve_2x1 ++.L_dsolve_2x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++.L_dsolve_2x1: ++ PTR_ADDI A0, T1, -(2 * 2 * 8) ++ PTR_ADDI B0, T2, -(2 * 1 * 8) ++ ++ dsolve_2 1 ++.endm ++ ++.macro dgemm_dsolve_4x1 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_4x1_load ++ dgemm_4x1 ++ b .L_dsolve_4x1 ++.L_dsolve_4x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++.L_dsolve_4x1: ++ PTR_ADDI A0, T1, -(4 * 4 * 8) ++ PTR_ADDI B0, T2, -(4 * 1 * 8) ++ ++ dsolve_4 1 ++.endm ++ ++.macro dgemm_dsolve_8x1 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_8x1_load ++ dgemm_8x1 ++ b .L_dsolve_8x1 ++.L_dsolve_8x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++.L_dsolve_8x1: ++ PTR_ADDI A0, T1, -(8 * 8 * 8) ++ PTR_ADDI B0, T2, -(8 * 1 * 8) ++ ++ dsolve_8 1 ++.endm ++ ++.macro dgemm_dsolve_16x1 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_16x1_load ++ dgemm_16x1 ++ b .L_dsolve_16x1 ++.L_dsolve_16x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++.L_dsolve_16x1: ++ PTR_ADDI A0, T1, -(16 * 8 * 8) ++ PTR_ADDI A0, A0, -(16 * 8 * 8) ++ PTR_ADDI B0, T2, -(16 * 1 * 8) ++ ++ dsolve_16 1 ++.endm ++ ++ PROLOGUE ++ push_if_used 26, 32 ++ PTR_SLLI LDC, LDC, 3 ++ /* if (!(N >> 2)) goto L_N3 */ ++ PTR_SRAI J, N, 2 /* J = bn >> 2 */ ++ andi N, N, 0x03 ++ beq ZERO, J, .L_N3 ++.align 5 ++.L_J1: ++ PTR_ADDI J, J, -1 ++ PTR_ADD KK, M, OFFSET ++ ++ andi I, M, 15 ++ beq ZERO, I, .L_M16 ++ andi I, M, 1 ++ beqz I, .L_M2 ++.L_M1: ++ PTR_ADDI T0, M, -1 ++ PTR_SLLI T0, T0, 3 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */ ++ PTR_ADD CC, T0, C /* c + (m - 1) */ ++ ++ PTR_SLLI T0, KK, 5 ++ PTR_ADD B0, B, T0 /* b + 4 * kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ dgemm_dsolve_1x4 ++ PTR_ADDI KK, KK, -1 ++.L_M2: ++ andi I, M, 2 ++ beqz I, .L_M4 ++ PTR_SRLI T0, M, 1 ++ PTR_SLLI T0, T0, 1 ++ PTR_ADDI T0, T0, -2 ++ PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */ ++ PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/ ++ PTR_SLLI T1, KK, 4 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */ ++ PTR_SLLI T0, KK, 5 ++ PTR_ADD B0, B, T0 /* b + 4 * kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ dgemm_dsolve_2x4 ++ PTR_ADDI KK, KK, -2 ++.L_M4: ++ andi I, M, 4 ++ beqz I, .L_M8 ++ PTR_SRLI T0, M, 2 ++ PTR_SLLI T0, T0, 2 ++ PTR_ADDI T0, T0, -4 ++ PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */ ++ PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/ ++ PTR_SLLI T1, KK, 5 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */ ++ PTR_SLLI T0, KK, 5 ++ PTR_ADD B0, B, T0 /* b + 4 * kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ dgemm_dsolve_4x4 ++ PTR_ADDI KK, KK, -4 ++.L_M8: ++ andi I, M, 8 ++ beqz I, .L_M16 ++ PTR_SRLI T0, M, 3 ++ PTR_SLLI T0, T0, 3 ++ PTR_ADDI T0, T0, -8 ++ PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */ ++ PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/ ++ PTR_SLLI T1, KK, 6 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */ ++ PTR_SLLI T0, KK, 5 ++ PTR_ADD B0, B, T0 /* b + 4 * kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ dgemm_dsolve_8x4 ++ PTR_ADDI KK, 
KK, -8 ++.L_M16: ++ PTR_SRAI I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_M0 ++ ++ PTR_SRLI T0, M, 4 ++ PTR_SLLI T0, T0, 4 ++ PTR_ADDI T0, T0, -16 /* ((M & -16)) - 16) */ ++ PTR_SLLI T0, T0, 3 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, A, AA ++ PTR_ADD CC, C, T0 ++.align 5 ++.L_I1: ++ PTR_SLLI T0, KK, 5 ++ PTR_ADD B0, B, T0 ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ PTR_SLLI T0, KK, 7 ++ PTR_ADD A0, AA, T0 ++ dgemm_dsolve_16x4 ++ PTR_ADDI I, I, -1 ++ PTR_ADDI KK, KK, -16 ++ PTR_ADDI CC, CC, -(16 * 8) ++ PTR_SLLI T0, K, 7 ++ PTR_SUB AA, AA, T0 ++ blt ZERO, I, .L_I1 ++.L_M0: ++ PTR_SLLI T0, K, 3 ++ PTR_ALSL B, T0, B, 2 // b += 4 * k; ++ PTR_ALSL C, LDC, C, 2 // c += 4 * ldc ++ blt ZERO, J, .L_J1 ++.L_N3: ++ andi J, N, 2 ++ beq ZERO, J, .L_N1 ++ ++ PTR_ADD KK, M, OFFSET ++ andi I, M, 15 ++ beq ZERO, I, .L_N3_M16 ++ andi I, M, 1 ++ beqz I, .L_N3_M2 ++.L_N3_M1: ++ PTR_ADDI KK, KK, -1 ++ ++ PTR_ADDI T0, M, -1 ++ PTR_SLLI T0, T0, 3 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */ ++ PTR_ADD CC, T0, C /* c + (m - 1) */ ++ ++ PTR_SLLI T0, KK, 4 ++ PTR_ADD B0, B, T0 /* b + 2 * kk */ ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ // dgemm_dsolve_1x2 ++ GLD f, d, $f0, A0, 0, $f1, C0, 0, $f2, C1, 0 ++ GMUL f, d, $f1, $f1, $f0, $f2, $f2, $f0 ++ GST f, d, $f1, C0, 0, $f2, C1, 0, $f1, B0, 0, $f2, B0, 8 ++.L_N3_M2: ++ andi I, M, 2 ++ beqz I, .L_N3_M4 ++ PTR_SRLI T0, M, 1 ++ PTR_SLLI T0, T0, 1 ++ PTR_ADDI T0, T0, -2 ++ PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */ ++ PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/ ++ PTR_SLLI T1, KK, 4 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */ ++ PTR_SLLI T0, KK, 4 ++ PTR_ADD B0, B, T0 /* b + 2 * kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ dgemm_dsolve_2x2 ++ PTR_ADDI KK, KK, -2 ++.L_N3_M4: ++ andi I, M, 4 ++ beqz I, .L_N3_M8 ++ PTR_SRLI T0, M, 2 ++ PTR_SLLI T0, T0, 2 ++ PTR_ADDI T0, T0, -4 ++ PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */ ++ PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/ ++ PTR_SLLI T1, KK, 5 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */ ++ PTR_SLLI T0, KK, 4 ++ PTR_ADD B0, B, T0 /* b + 2 * kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ dgemm_dsolve_4x2 ++ PTR_ADDI KK, KK, -4 ++.L_N3_M8: ++ andi I, M, 8 ++ beqz I, .L_N3_M16 ++ PTR_SRLI T0, M, 3 ++ PTR_SLLI T0, T0, 3 ++ PTR_ADDI T0, T0, -8 ++ PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */ ++ PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/ ++ PTR_SLLI T1, KK, 6 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */ ++ PTR_SLLI T0, KK, 4 ++ PTR_ADD B0, B, T0 /* b + 2 * kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ dgemm_dsolve_8x2 ++ PTR_ADDI KK, KK, -8 ++.L_N3_M16: ++ PTR_SRAI I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_N3_M0 ++ ++ PTR_SRLI T0, M, 4 ++ PTR_SLLI T0, T0, 4 ++ PTR_ADDI T0, T0, -16 /* ((M & -16)) - 16) */ ++ PTR_SLLI T0, T0, 3 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, A, AA ++ PTR_ADD CC, C, T0 ++.align 5 ++.L_N3_I1: ++ PTR_SLLI T0, KK, 4 ++ PTR_ADD B0, B, T0 ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ PTR_SLLI T0, KK, 7 ++ PTR_ADD A0, AA, T0 ++ dgemm_dsolve_16x2 ++ PTR_ADDI I, I, -1 ++ PTR_ADDI KK, KK, -16 ++ PTR_ADDI CC, CC, -(16 * 8) ++ PTR_SLLI T0, K, 7 ++ PTR_SUB AA, AA, T0 ++ blt ZERO, I, .L_N3_I1 ++.L_N3_M0: ++ PTR_SLLI T0, K, 3 ++ PTR_ALSL B, T0, B, 1 // b += 2 * k; ++ PTR_ALSL C, LDC, C, 1 // c += 2 * ldc 
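++// Note: each column pass (4, then 2, then 1 columns of B/C) repeats the same
++// bookkeeping: KK is reset to M + OFFSET, the row tiles of C are walked from
++// the 1/2/4/8 remainders up to the 16-wide blocks, and KK shrinks by the tile
++// height from one tile to the next, so that B0 = B + ncol * KK * 8 and
++// L = K - KK always describe the part of the panel still needing the GEMM
++// update before the solve. Roughly, in C-like pseudocode (descriptive names
++// only):
++//   kk = m + offset;
++//   for (each row tile of this column pass) {
++//     b0 = b + ncol * kk * sizeof(double);
++//     l  = k - kk;                  // remaining GEMM depth
++//     dgemm_dsolve_<tile>x<ncol>();
++//     kk -= tile_height;
++//   }
++//   b += ncol * k; c += ncol * ldc; // as done just above for ncol = 2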
++.L_N1: ++ andi J, N, 1 ++ beq ZERO, J, .L_N0 ++ ++ PTR_ADD KK, M, OFFSET ++ andi I, M, 15 ++ beq ZERO, I, .L_N1_M16 ++ andi I, M, 1 ++ beqz I, .L_N1_M2 ++.L_N1_M1: ++ PTR_ADDI KK, KK, -1 ++ ++ PTR_ADDI T0, M, -1 ++ PTR_SLLI T0, T0, 3 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */ ++ PTR_ADD CC, T0, C /* c + (m - 1) */ ++ ++ PTR_SLLI T0, KK, 3 ++ PTR_ADD B0, B, T0 /* b + kk */ ++ GADD , d, C0, CC, ZERO ++ // dgemm_dsolve_1x1 ++ GLD f, d, $f0, A0, 0, $f1, C0, 0 ++ GMUL f, d, $f1, $f1, $f0 ++ GST f, d, $f1, C0, 0, $f1, B0, 0 ++.L_N1_M2: ++ andi I, M, 2 ++ beqz I, .L_N1_M4 ++ PTR_SRLI T0, M, 1 ++ PTR_SLLI T0, T0, 1 ++ PTR_ADDI T0, T0, -2 ++ PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */ ++ PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/ ++ PTR_SLLI T1, KK, 4 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */ ++ PTR_SLLI T0, KK, 3 ++ PTR_ADD B0, B, T0 /* b + kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO ++ dgemm_dsolve_2x1 ++ PTR_ADDI KK, KK, -2 ++.L_N1_M4: ++ andi I, M, 4 ++ beqz I, .L_N1_M8 ++ PTR_SRLI T0, M, 2 ++ PTR_SLLI T0, T0, 2 ++ PTR_ADDI T0, T0, -4 ++ PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */ ++ PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/ ++ PTR_SLLI T1, KK, 5 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */ ++ PTR_SLLI T0, KK, 3 ++ PTR_ADD B0, B, T0 /* b + kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO ++ dgemm_dsolve_4x1 ++ PTR_ADDI KK, KK, -4 ++.L_N1_M8: ++ andi I, M, 8 ++ beqz I, .L_N1_M16 ++ PTR_SRLI T0, M, 3 ++ PTR_SLLI T0, T0, 3 ++ PTR_ADDI T0, T0, -8 ++ PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */ ++ PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/ ++ PTR_SLLI T1, KK, 6 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */ ++ PTR_SLLI T0, KK, 3 ++ PTR_ADD B0, B, T0 /* b + kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO ++ dgemm_dsolve_8x1 ++ PTR_ADDI KK, KK, -8 ++.L_N1_M16: ++ PTR_SRAI I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_N1_M0 ++ ++ PTR_SRLI T0, M, 4 ++ PTR_SLLI T0, T0, 4 ++ PTR_ADDI T0, T0, -16 /* ((M & -16)) - 16) */ ++ PTR_SLLI T0, T0, 3 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, A, AA ++ PTR_ADD CC, C, T0 ++.align 5 ++.L_N1_I1: ++ PTR_SLLI T0, KK, 3 ++ PTR_ADD B0, B, T0 ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO ++ PTR_SLLI T0, KK, 7 ++ PTR_ADD A0, AA, T0 ++ dgemm_dsolve_16x1 ++ PTR_ADDI I, I, -1 ++ PTR_ADDI KK, KK, -16 ++ PTR_ADDI CC, CC, -(16 * 8) ++ PTR_SLLI T0, K, 7 ++ PTR_SUB AA, AA, T0 ++ blt ZERO, I, .L_N1_I1 ++.L_N1_M0: ++.L_N0: ++ pop_if_used 26, 32 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S +new file mode 100644 +index 000000000..0e2cacccf +--- /dev/null ++++ b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S +@@ -0,0 +1,959 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. 
Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/08/26 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++*********************************************************************/ ++ ++/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, ++ * FLOAT *c, BLASLONG ldc, BLASLONG offset) ++ */ ++ ++#define M $r4 // param 1: bm ++#define N $r5 // param 2: bn ++#define K $r6 // param 3: bk ++#define A $r7 // param 5: ba ++#define B $r8 // param 6: bb ++#define C $r9 // param 7: bc ++#define LDC $r10 // param 8: ldc ++#define OFFSET $r11 // param 9: offset ++ ++/* Cycle control parameters */ ++#define I $r13 ++#define J $r14 ++#define L $r15 ++#define TL $r16 ++/* Matrix address */ ++#define A0 $r17 ++#define B0 $r18 ++#define C0 $r19 ++#define C1 $r20 ++#define C2 $r23 ++#define C3 $r24 ++#define T0 $r25 ++#define T1 $r26 ++#define T2 $r27 ++#define KK $r28 ++#define AA $r29 ++#define CC $r30 ++#define BB B0 ++#undef ZERO ++#define ZERO $r0 ++ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++#define U8 $xr8 ++#define U9 $xr9 ++#define U10 $xr10 ++#define U11 $xr11 ++#define U12 $xr12 ++#define U13 $xr13 ++#define U14 $xr14 ++#define U15 $xr15 ++#define D0 $xr16 ++#define D1 $xr17 ++#define D2 $xr18 ++#define D3 $xr19 ++#define D4 $xr20 ++#define D5 $xr21 ++#define D6 $xr22 ++#define D7 $xr23 ++#define D8 $xr24 ++#define D9 $xr25 ++#define D10 $xr26 ++#define D11 $xr27 ++#define D12 $xr28 ++#define D13 $xr29 ++#define D14 $xr30 ++#define D15 $xr31 ++#define G0 D0 ++#define G1 D1 ++#define G2 D2 ++#define G3 D3 ++#define G4 D4 ++#define G5 D5 ++#define G6 D6 ++#define G7 D7 ++#define G8 D8 ++#define G9 D9 ++#define G10 D10 ++#define G11 D11 ++#define G12 D12 ++#define G13 D13 ++#define G14 D14 ++#define G15 D15 ++ ++/* Prefetch interval */ ++#define A_PRE 0x400 ++#define B_PRE 0x100 ++ ++#include "dtrsm_kernel_macro.S" ++ ++.macro ldrepl_macro start, end, stride ++// Load Ux (x = 0...15) ++.if \start <= \end ++ GLDREPL xv, d, $xr\start, A0, \stride * 8 ++ ldrepl_macro %start + 1, \end, %stride + 1 ++.endif ++.endm ++.macro nmsub_macro start0, end0, start1, reg ++// Gx -= reg * Ux ++.if \start0 <= \end0 ++ xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 ++ nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg ++.endif ++.endm 
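++// For reference, these recursive macros simply unroll over a register range,
++// advancing the offset/source index by one per step. A hypothetical call
++// "ldrepl_macro 2, 4, 0" expands to
++//   GLDREPL xv, d, $xr2, A0, 0 * 8
++//   GLDREPL xv, d, $xr3, A0, 1 * 8
++//   GLDREPL xv, d, $xr4, A0, 2 * 8
++// and "nmsub_macro 17, 19, 1, G0" expands to
++//   xvfnmsub.d $xr17, G0, $xr1, $xr17
++//   xvfnmsub.d $xr18, G0, $xr2, $xr18
++//   xvfnmsub.d $xr19, G0, $xr3, $xr19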
++.macro B_st_macro start, end, stride, N ++// Store Gx(x = 16...31) ++.if \start <= \end ++.if \N == 4 ++ xvst $xr\start, B0, \stride * 0x20 ++.elseif \N == 2 ++ vst $vr\start, B0, \stride * 0x10 ++.elseif \N == 1 ++ fst.d $f\start, B0, \stride * 0x08 ++.endif ++ B_st_macro %start + 1, \end, %stride + 1, \N ++.endif ++.endm ++ ++.macro dsolve_16 N ++// The data layout of C (4x16) is as follows (store 4 data in each register): ++// U0 U1 U2 U3 ++// U4 U5 U6 U7 ++// U8 U9 U10 U11 ++// U12 U13 U14 U15 ++// The first step is to transpose the result of C ++ GTRANSPOSE4x4_D U3, U7, U11, U15, G12, G13, G14, G15, D0, D1 ++ GTRANSPOSE4x4_D U2, U6, U10, U14, G8, G9, G10, G11, D0, D1 ++ GTRANSPOSE4x4_D U1, U5, U9, U13, G4, G5, G6, G7, U3, U7 ++ GTRANSPOSE4x4_D U0, U4, U8, U12, G0, G1, G2, G3, U3, U7 ++// Now we have the following memory layout of C: ++// 0 1 2 3 ... 15 ++// 0 | | | | | | | ++// 1 | G0 | G1 | G2 | G3 | ... | G15 | ++// 2 | | | | | | | ++// 3 | | | | | | | ++// Next we are going to process matrix A with a size of 16x16, ++// using only the upper triangular portion. The memory layout of ++// matrix A is as follows, quite large. ++//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ++// 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 ++// 34 35 36 37 38 39 40 41 42 43 44 45 46 47 ++// 51 52 53 54 55 56 57 58 59 60 61 62 63 ++// 68 69 70 71 72 73 74 75 76 77 78 79 ++// 85 86 87 88 89 90 91 92 93 94 95 ++// 102 103 104 105 106 107 108 109 110 111 ++// 119 120 121 122 123 124 125 126 127 ++// 136 137 138 139 140 141 142 143 ++// 153 154 155 156 157 158 159 ++// 170 171 172 173 174 175 ++// 187 188 189 190 191 ++// 204 205 206 207 ++// 221 222 223 ++// 238 239 ++// 255 ++// Sequentially extract data from A in row order ++// Load 0 ++ ldrepl_macro 0, 15, 0 ++ GMUL xvf, d, G0, G0, U0 ++ nmsub_macro 17, 31, 1, G0 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 1 ++ ldrepl_macro 1, 15, 0 ++ GMUL xvf, d, G1, G1, U1 ++ nmsub_macro 18, 31, 2, G1 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 2 ++ ldrepl_macro 2, 15, 0 ++ GMUL xvf, d, G2, G2, U2 ++ nmsub_macro 19, 31, 3, G2 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 3 ++ ldrepl_macro 3, 15, 0 ++ GMUL xvf, d, G3, G3, U3 ++ nmsub_macro 20, 31, 4, G3 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 4 ++ ldrepl_macro 4, 15, 0 ++ GMUL xvf, d, G4, G4, U4 ++ nmsub_macro 21, 31, 5, G4 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 5 ++ ldrepl_macro 5, 15, 0 ++ GMUL xvf, d, G5, G5, U5 ++ nmsub_macro 22, 31, 6, G5 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 6 ++ ldrepl_macro 6, 15, 0 ++ GMUL xvf, d, G6, G6, U6 ++ nmsub_macro 23, 31, 7, G6 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 7 ++ ldrepl_macro 7, 15, 0 ++ GMUL xvf, d, G7, G7, U7 ++ nmsub_macro 24, 31, 8, G7 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 8 ++ ldrepl_macro 8, 15, 0 ++ GMUL xvf, d, G8, G8, U8 ++ nmsub_macro 25, 31, 9, G8 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 9 ++ ldrepl_macro 9, 15, 0 ++ GMUL xvf, d, G9, G9, U9 ++ nmsub_macro 26, 31, 10, G9 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 10 ++ ldrepl_macro 10, 15, 0 ++ GMUL xvf, d, G10, G10, U10 ++ nmsub_macro 27, 31, 11, G10 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 11 ++ ldrepl_macro 11, 15, 0 ++ GMUL xvf, d, G11, G11, U11 ++ nmsub_macro 28, 31, 12, G11 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 12 ++ ldrepl_macro 12, 15, 0 ++ GMUL xvf, d, G12, G12, U12 ++ nmsub_macro 29, 31, 13, G12 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 13 ++ ldrepl_macro 13, 15, 0 ++ GMUL xvf, d, G13, G13, U13 ++ nmsub_macro 30, 31, 14, G13 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 14 ++ ldrepl_macro 14, 15, 0 ++ GMUL xvf, d, G14, G14, U14 ++ nmsub_macro 31, 31, 15, G14 ++ PTR_ADDI 
A0, A0, 17 * 8 ++// Load 15 ++ ldrepl_macro 15, 15, 0 ++ GMUL xvf, d, G15, G15, U15 ++// Finally, We can store the result. ++// For B, stored sequentially, and C, first transpose and then store ++ B_st_macro 16, 31, 0, \N ++ GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 ++ GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1 ++ GTRANSPOSE4x4_D G8, G9, G10, G11, G8, G9, G10, G11, U0, U1 ++ GTRANSPOSE4x4_D G12, G13, G14, G15, G12, G13, G14, G15, U0, U1 ++.if \N == 4 ++ GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60, \ ++ G1, C1, 0x00, G5, C1, 0x20, G9, C1, 0x40, G13, C1, 0x60, \ ++ G2, C2, 0x00, G6, C2, 0x20, G10, C2, 0x40, G14, C2, 0x60, \ ++ G3, C3, 0x00, G7, C3, 0x20, G11, C3, 0x40, G15, C3, 0x60 ++.elseif \N == 2 ++ GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60, \ ++ G1, C1, 0x00, G5, C1, 0x20, G9, C1, 0x40, G13, C1, 0x60 ++.elseif \N == 1 ++ GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60 ++.endif ++.endm ++ ++.macro dgemm_dsolve_16x4 ++ bge ZERO, KK, .L_dsolve_16x4_load ++ dgemm_16x4 ++ b .L_dsolve_16x4 ++.L_dsolve_16x4_load: ++ // Load C ++ GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 ++ GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 ++ GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 ++ GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 ++/********************** solver ******************/ ++.L_dsolve_16x4: ++ dsolve_16 4 ++.endm ++ ++.macro dsolve_8 N ++// The data layout of C (4x8) is as follows (store 4 data in each register): ++// U0 U1 ++// U2 U3 ++// U4 U5 ++// U6 U7 ++// The first step is to transpose the result of C ++ GTRANSPOSE4x4_D U1, U3, U5, U7, G4, G5, G6, G7, G8, G9 ++ GTRANSPOSE4x4_D U0, U2, U4, U6, G0, G1, G2, G3, G8, G9 ++// Now we have the following memory layout of C: ++// 0 1 2 3 ... 7 ++// 0 | | | | | | | ++// 1 | G0 | G1 | G2 | G3 | ... | G7 | ++// 2 | | | | | | | ++// 3 | | | | | | | ++// Next we are going to process matrix A with a size of 8x8, ++// using only the upper triangular portion. The memory layout of ++// matrix A is as follows: ++//0 1 2 3 4 5 6 7 ++// 9 10 11 12 13 14 15 ++// 18 19 20 21 22 23 ++// 27 28 29 30 31 ++// 36 37 38 39 ++// 45 46 47 ++// 54 55 ++// 63 ++// Sequentially extract data from A in row order ++// Load 0 ++ ldrepl_macro 0, 7, 0 ++ GMUL xvf, d, G0, G0, U0 ++ nmsub_macro 17, 23, 1, G0 ++ PTR_ADDI A0, A0, 9 * 8 ++// Load 1 ++ ldrepl_macro 1, 7, 0 ++ GMUL xvf, d, G1, G1, U1 ++ nmsub_macro 18, 23, 2, G1 ++ PTR_ADDI A0, A0, 9 * 8 ++// Load 2 ++ ldrepl_macro 2, 7, 0 ++ GMUL xvf, d, G2, G2, U2 ++ nmsub_macro 19, 23, 3, G2 ++ PTR_ADDI A0, A0, 9 * 8 ++// Load 3 ++ ldrepl_macro 3, 7, 0 ++ GMUL xvf, d, G3, G3, U3 ++ nmsub_macro 20, 23, 4, G3 ++ PTR_ADDI A0, A0, 9 * 8 ++// Load 4 ++ ldrepl_macro 4, 7, 0 ++ GMUL xvf, d, G4, G4, U4 ++ nmsub_macro 21, 23, 5, G4 ++ PTR_ADDI A0, A0, 9 * 8 ++// Load 5 ++ ldrepl_macro 5, 7, 0 ++ GMUL xvf, d, G5, G5, U5 ++ nmsub_macro 22, 23, 6, G5 ++ PTR_ADDI A0, A0, 9 * 8 ++// Load 6 ++ ldrepl_macro 6, 7, 0 ++ GMUL xvf, d, G6, G6, U6 ++ nmsub_macro 23, 23, 7, G6 ++ PTR_ADDI A0, A0, 9 * 8 ++// Load 7 ++ ldrepl_macro 7, 7, 0 ++ GMUL xvf, d, G7, G7, U7 ++// Finally, We can store the result. 
++// For B, stored sequentially, and C, first transpose and then store ++ B_st_macro 16, 23, 0, \N ++ GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 ++ GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1 ++.if \N == 4 ++ GST xv, , G0, C0, 0x00, G4, C0, 0x20, \ ++ G1, C1, 0x00, G5, C1, 0x20, \ ++ G2, C2, 0x00, G6, C2, 0x20, \ ++ G3, C3, 0x00, G7, C3, 0x20 ++.elseif \N == 2 ++ GST xv, , G0, C0, 0x00, G4, C0, 0x20, \ ++ G1, C1, 0x00, G5, C1, 0x20 ++.elseif \N == 1 ++ GST xv, , G0, C0, 0x00, G4, C0, 0x20 ++.endif ++.endm ++ ++.macro dgemm_dsolve_8x4 ++ bge ZERO, L, .L_dsolve_8x4_load ++ dgemm_8x4 ++ b .L_dsolve_8x4 ++.L_dsolve_8x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++ ++ /* Load C2 */ ++ xvld U4, C2, 0x00 ++ xvld U5, C2, 0x20 ++ ++ /* Load C3 */ ++ xvld U6, C3, 0x00 ++ xvld U7, C3, 0x20 ++/********* solver *********/ ++.L_dsolve_8x4: ++ dsolve_8 4 ++.endm ++ ++.macro dsolve_4 N ++// The data layout of C (4x4) is as follows (store 4 data in each register): ++// U0 ++// U1 ++// U2 ++// U3 ++// The first step is to transpose the result of C ++ GTRANSPOSE4x4_D U0, U1, U2, U3, G0, G1, G2, G3, G4, G5 ++// Now we have the following memory layout of C: ++// 0 1 2 3 ++// 0 | | | | | ++// 1 | G0 | G1 | G2 | G3 | ++// 2 | | | | | ++// 3 | | | | | ++// Next we are going to process matrix A with a size of 4x4, ++// using only the upper triangular portion. The memory layout of ++// matrix A is as follows: ++//0 1 2 3 ++// 5 6 7 ++// 10 11 ++// 15 ++// Sequentially extract data from A in row order ++// Load 0 ++ ldrepl_macro 0, 3, 0 ++ GMUL xvf, d, G0, G0, U0 ++ nmsub_macro 17, 19, 1, G0 ++ PTR_ADDI A0, A0, 5 * 8 ++// Load 1 ++ ldrepl_macro 1, 3, 0 ++ GMUL xvf, d, G1, G1, U1 ++ nmsub_macro 18, 19, 2, G1 ++ PTR_ADDI A0, A0, 5 * 8 ++// Load 2 ++ ldrepl_macro 2, 3, 0 ++ GMUL xvf, d, G2, G2, U2 ++ nmsub_macro 19, 19, 3, G2 ++ PTR_ADDI A0, A0, 5 * 8 ++// Load 3 ++ ldrepl_macro 3, 3, 0 ++ GMUL xvf, d, G3, G3, U3 ++// Finally, We can store the result. ++// For B, stored sequentially, and C, first transpose and then store ++ B_st_macro 16, 19, 0, \N ++ GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 ++.if \N == 4 ++ GST xv, , G0, C0, 0x00, G1, C1, 0x00, G2, C2, 0x00, G3, C3, 0x00 ++.elseif \N == 2 ++ GST xv, , G0, C0, 0x00, G1, C1, 0x00 ++.elseif \N == 1 ++ GST xv, , G0, C0, 0x00 ++.endif ++.endm ++ ++.macro dgemm_dsolve_4x4 ++ bge ZERO, L, .L_dsolve_4x4_load ++ dgemm_4x4 ++ b .L_dsolve_4x4 ++.L_dsolve_4x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++/************** solver *****************/ ++.L_dsolve_4x4: ++ dsolve_4 4 ++.endm ++ ++.macro dsolve_2 N ++// Transpose ++ GSBUTTERFLY xv, d, G0, G1, U1, U0 ++// Now we have the following memory layout of C: ++// 0 1 ++// 0 | | | ++// 1 | G0 | G1 | ++// 2 | | | ++// 3 | | | ++// Next we are going to process matrix A with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix A is as follows: ++//0 1 ++// 3 ++// Sequentially extract data from A in row order ++// Load 0 ++ ldrepl_macro 0, 1, 0 ++ GMUL xvf, d, G0, G0, U0 ++ nmsub_macro 17, 17, 1, G0 ++ PTR_ADDI A0, A0, 3 * 8 ++// Load 1 ++ ldrepl_macro 1, 1, 0 ++ GMUL xvf, d, G1, G1, U1 ++// Finally, We can store the result. 
++// For B, stored sequentially, and C, first transpose and then store ++ B_st_macro 16, 17, 0, \N ++ GSBUTTERFLY xv, d, U0, U1, G1, G0 ++.if \N == 4 ++ vst $vr0, C0, 0x00 ++ vst $vr1, C1, 0x00 ++ xvstelm.d U0, C2, 0x00, 0x02 ++ xvstelm.d U1, C3, 0x00, 0x02 ++ xvstelm.d U0, C2, 0x08, 0x03 ++ xvstelm.d U1, C3, 0x08, 0x03 ++.elseif \N == 2 ++ vst $vr0, C0, 0x00 ++ vst $vr1, C1, 0x00 ++.elseif \N == 1 ++ vst $vr0, C0, 0x00 ++.endif ++.endm ++ ++.macro dgemm_dsolve_2x4 ++ bge ZERO, L, .L_dsolve_2x4_load ++ dgemm_2x4 ++ b .L_dsolve_2x4 ++.L_dsolve_2x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++ ++ xvpermi.q U0, U2, 0x02 ++ xvpermi.q U1, U3, 0x02 ++/********************** solver ******************/ ++.L_dsolve_2x4: ++ dsolve_2 4 ++.endm ++ ++.macro dgemm_dsolve_1x4 ++ bge ZERO, L, .L_dsolve_1x4_load ++ dgemm_1x4 ++ b .L_dsolve_1x4 ++.L_dsolve_1x4_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++ fld.d $f1, C1, 0x00 ++ fld.d $f2, C2, 0x00 ++ fld.d $f3, C3, 0x00 ++ xvinsve0.d U0, U1, 0x01 ++ xvinsve0.d U0, U2, 0x02 ++ xvinsve0.d U0, U3, 0x03 ++.L_dsolve_1x4: ++ GLDREPL xv, d, D0, A0, 0x00 ++ GMUL xvf, d, U0, U0, D0 ++ // Store C ++ xvstelm.d U0, C0, 0x00, 0x00 ++ xvstelm.d U0, C1, 0x00, 0x01 ++ xvstelm.d U0, C2, 0x00, 0x02 ++ xvstelm.d U0, C3, 0x00, 0x03 ++ // Store B ++ xvst U0, B0, 0x00 ++.endm ++ ++.macro dgemm_dsolve_16x2 ++ bge ZERO, L, .L_dsolve_16x2_load ++ dgemm_16x2 ++ b .L_dsolve_16x2 ++.L_dsolve_16x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++ /* Load C1 */ ++ xvld U4, C1, 0x00 ++ xvld U5, C1, 0x20 ++ xvld U6, C1, 0x40 ++ xvld U7, C1, 0x60 ++.L_dsolve_16x2: ++ dsolve_16 2 ++.endm ++ ++.macro dgemm_dsolve_8x2 ++ bge ZERO, L, .L_dsolve_8x2_load ++ dgemm_8x2 ++ b .L_dsolve_8x2 ++.L_dsolve_8x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++.L_dsolve_8x2: ++ dsolve_8 2 ++.endm ++ ++.macro dgemm_dsolve_4x2 ++ bge ZERO, L, .L_dsolve_4x2_load ++ dgemm_4x2 ++ b .L_dsolve_4x2 ++.L_dsolve_4x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++.L_dsolve_4x2: ++ dsolve_4 2 ++.endm ++ ++.macro dgemm_dsolve_1x2 ++ bge ZERO, L, .L_dsolve_1x2_load ++ dgemm_1x2 ++ b .L_dsolve_1x2 ++.L_dsolve_1x2_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++ fld.d $f1, C1, 0x00 ++ xvinsve0.d U0, U1, 0x01 ++.L_dsolve_1x2: ++ GLDREPL xv, d, D0, A0, 0x00 ++ GMUL xvf, d, U0, U0, D0 ++ // Store C ++ xvstelm.d U0, C0, 0x00, 0x00 ++ xvstelm.d U0, C1, 0x00, 0x01 ++ // Store B ++ vst $vr0, B0, 0x00 ++.endm ++ ++.macro dgemm_dsolve_2x2 ++ bge ZERO, L, .L_dsolve_2x2_load ++ dgemm_2x2 ++ b .L_dsolve_2x2 ++.L_dsolve_2x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++.L_dsolve_2x2: ++ dsolve_2 2 ++.endm ++ ++.macro dgemm_dsolve_16x1 ++ bge ZERO, L, .L_dsolve_16x1_load ++ dgemm_16x1 ++ b .L_dsolve_16x1 ++.L_dsolve_16x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++.L_dsolve_16x1: ++ dsolve_16 1 ++.endm ++ ++.macro dgemm_dsolve_8x1 ++ bge ZERO, L, .L_dsolve_8x1_load ++ dgemm_8x1 ++ b .L_dsolve_8x1 ++.L_dsolve_8x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++.L_dsolve_8x1: ++ dsolve_8 1 ++.endm ++ ++.macro dgemm_dsolve_4x1 ++ bge ZERO, L, .L_dsolve_4x1_load ++ dgemm_4x1 ++ b .L_dsolve_4x1 ++.L_dsolve_4x1_load: ++ /* Load C0 */ ++ xvld 
U0, C0, 0x00 ++.L_dsolve_4x1: ++ dsolve_4 1 ++.endm ++ ++.macro dgemm_dsolve_2x1 ++ bge ZERO, L, .L_dsolve_2x1_load ++ dgemm_2x1 ++ b .L_dsolve_2x1 ++.L_dsolve_2x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++.L_dsolve_2x1: ++ dsolve_2 1 ++.endm ++ ++.macro dgemm_dsolve_1x1 ++ bge ZERO, L, .L_dsolve_1x1_load ++ dgemm_1x1 ++ b .L_dsolve_1x1 ++.L_dsolve_1x1_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++.L_dsolve_1x1: ++ GLDREPL xv, d, D0, A0, 0x00 ++ GMUL xvf, d, U0, U0, D0 ++ // Store C ++ xvstelm.d U0, C0, 0x00, 0x00 ++ // Store B ++ xvstelm.d U0, B0, 0x00, 0x00 ++.endm ++ ++ PROLOGUE ++ push_if_used 26, 32 ++ PTR_SLLI LDC, LDC, 3 ++ /* if (!(N >> 2)) goto L_N3 */ ++ PTR_SRAI J, N, 2 /* J = bn >> 2 */ ++ andi N, N, 0x03 ++ beq ZERO, J, .L_N3 ++.align 5 ++.L_J1: ++ PTR_ADDI J, J, -1 ++ move KK, OFFSET ++ move AA, A ++ move CC, C ++ PTR_SRAI I, M, 4 // M >> 4 ++ beqz I, .L_M15 ++.align 4 ++.L_I1: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_16x4 ++ PTR_ADDI I, I, -1 ++ PTR_SLLI T0, K, 7 ++ PTR_ADDI CC, CC, 0x80 // cc += 16 ++ PTR_ADDI KK, KK, 0x10 // kk += 16 ++ PTR_ADD AA, AA, T0 // aa += 16 * k ++ bnez I, .L_I1 ++.L_M15: ++ andi I, M, 8 ++ beqz I, .L_M7 ++.L_M8: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_8x4 ++ PTR_SLLI T0, K, 6 ++ PTR_ADDI CC, CC, 0x40 // cc += 8 ++ PTR_ADDI KK, KK, 0x08 // kk += 8 ++ PTR_ADD AA, AA, T0 // aa += 8 * k ++.L_M7: ++ andi I, M, 4 ++ beqz I, .L_M3 ++.L_M4: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_4x4 ++ PTR_SLLI T0, K, 5 ++ PTR_ADDI CC, CC, 0x20 // cc += 4 ++ PTR_ADDI KK, KK, 0x04 // kk += 4 ++ PTR_ADD AA, AA, T0 // aa += 4 * k ++.L_M3: ++ andi I, M, 2 ++ beqz I, .L_M1 ++.L_M2: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_2x4 ++ PTR_SLLI T0, K, 4 ++ PTR_ADDI CC, CC, 0x10 // cc += 2 ++ PTR_ADDI KK, KK, 0x02 // kk += 2 ++ PTR_ADD AA, AA, T0 // aa += 2 * k ++.L_M1: ++ andi I, M, 1 ++ beqz I, .L_M0 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_1x4 ++ PTR_SLLI T0, K, 3 ++ PTR_ADDI CC, CC, 0x08 // cc += 1 ++ PTR_ADDI KK, KK, 0x01 // kk += 1 ++ PTR_ADD AA, AA, T0 // aa += 1 * k ++.L_M0: ++ PTR_SLLI T0, K, 5 ++ PTR_SLLI T1, LDC, 2 ++ PTR_ADD B, B, T0 // b += 4 * k ++ PTR_ADD C, C, T1 // c += 4 * ldc ++ bnez J, .L_J1 ++.L_N3: ++ andi J, N, 2 ++ beq ZERO, J, .L_N1 ++.L_N2: ++ move KK, OFFSET ++ move AA, A ++ move CC, C ++ PTR_SRAI I, M, 4 // M >> 4 ++ beqz I, .L_N2_M15 ++.align 4 ++.L_N2_I1: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_16x2 ++ PTR_ADDI I, I, -1 ++ PTR_SLLI T0, K, 7 ++ PTR_ADDI CC, CC, 0x80 // cc += 16 ++ PTR_ADDI KK, KK, 0x10 // kk += 16 ++ PTR_ADD AA, AA, T0 // aa += 16 * k ++ bnez I, .L_N2_I1 ++.L_N2_M15: ++ andi I, M, 8 ++ beqz I, .L_N2_M7 ++.L_N2_M8: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_8x2 ++ PTR_SLLI T0, K, 6 ++ PTR_ADDI CC, CC, 0x40 // cc += 8 ++ PTR_ADDI KK, KK, 0x08 // kk += 8 ++ PTR_ADD AA, AA, T0 // aa += 8 * k ++.L_N2_M7: ++ andi I, M, 4 ++ beqz I, .L_N2_M3 ++.L_N2_M4: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_4x2 ++ PTR_SLLI T0, K, 5 ++ PTR_ADDI CC, CC, 0x20 // cc += 4 ++ PTR_ADDI 
KK, KK, 0x04 // kk += 4 ++ PTR_ADD AA, AA, T0 // aa += 4 * k ++.L_N2_M3: ++ andi I, M, 2 ++ beqz I, .L_N2_M1 ++.L_N2_M2: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_2x2 ++ PTR_SLLI T0, K, 4 ++ PTR_ADDI CC, CC, 0x10 // cc += 2 ++ PTR_ADDI KK, KK, 0x02 // kk += 2 ++ PTR_ADD AA, AA, T0 // aa += 2 * k ++.L_N2_M1: ++ andi I, M, 1 ++ beqz I, .L_N2_M0 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_1x2 ++ PTR_SLLI T0, K, 3 ++ PTR_ADDI CC, CC, 0x08 // cc += 1 ++ PTR_ADDI KK, KK, 0x01 // kk += 1 ++ PTR_ADD AA, AA, T0 // aa += 1 * k ++.L_N2_M0: ++ PTR_SLLI T0, K, 4 ++ PTR_SLLI T1, LDC, 1 ++ PTR_ADD B, B, T0 // b += 2 * k ++ PTR_ADD C, C, T1 // c += 2 * ldc ++.L_N1: ++ andi J, N, 1 ++ beq ZERO, J, .L_N0 ++ ++ move KK, OFFSET ++ move AA, A ++ move CC, C ++ PTR_SRAI I, M, 4 // M >> 4 ++ beqz I, .L_N1_M15 ++.align 4 ++.L_N1_I1: ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_16x1 ++ PTR_ADDI I, I, -1 ++ PTR_SLLI T0, K, 7 ++ PTR_ADDI CC, CC, 0x80 // cc += 16 ++ PTR_ADDI KK, KK, 0x10 // kk += 16 ++ PTR_ADD AA, AA, T0 // aa += 16 * k ++ bnez I, .L_N1_I1 ++.L_N1_M15: ++ andi I, M, 8 ++ beqz I, .L_N1_M7 ++.L_N1_M8: ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_8x1 ++ PTR_SLLI T0, K, 6 ++ PTR_ADDI CC, CC, 0x40 // cc += 8 ++ PTR_ADDI KK, KK, 0x08 // kk += 8 ++ PTR_ADD AA, AA, T0 // aa += 8 * k ++.L_N1_M7: ++ andi I, M, 4 ++ beqz I, .L_N1_M3 ++.L_N1_M4: ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_4x1 ++ PTR_SLLI T0, K, 5 ++ PTR_ADDI CC, CC, 0x20 // cc += 4 ++ PTR_ADDI KK, KK, 0x04 // kk += 4 ++ PTR_ADD AA, AA, T0 // aa += 4 * k ++.L_N1_M3: ++ andi I, M, 2 ++ beqz I, .L_N1_M1 ++.L_N1_M2: ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_2x1 ++ PTR_SLLI T0, K, 4 ++ PTR_ADDI CC, CC, 0x10 // cc += 2 ++ PTR_ADDI KK, KK, 0x02 // kk += 2 ++ PTR_ADD AA, AA, T0 // aa += 2 * k ++.L_N1_M1: ++ andi I, M, 1 ++ beqz I, .L_N1_M0 ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_1x1 ++ PTR_SLLI T0, K, 3 ++ PTR_ADDI CC, CC, 0x08 // cc += 1 ++ PTR_ADDI KK, KK, 0x01 // kk += 1 ++ PTR_ADD AA, AA, T0 // aa += 1 * k ++.L_N1_M0: ++.L_N0: ++ pop_if_used 26, 32 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S +new file mode 100644 +index 000000000..421339736 +--- /dev/null ++++ b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S +@@ -0,0 +1,882 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. 
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/09/26 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++*********************************************************************/ ++ ++/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, ++ * FLOAT *c, BLASLONG ldc, BLASLONG offset) ++ */ ++ ++#define M $r4 // param 1: bm ++#define N $r5 // param 2: bn ++#define K $r6 // param 3: bk ++#define A $r7 // param 5: ba ++#define B $r8 // param 6: bb ++#define C $r9 // param 7: bc ++#define LDC $r10 // param 8: ldc ++#define OFFSET $r11 // param 9: offset ++ ++/* Cycle control parameters */ ++#define I $r13 ++#define J $r14 ++#define L $r15 ++#define TL $r16 ++/* Matrix address */ ++#define A0 $r17 ++#define B0 $r18 ++#define C0 $r19 ++#define C1 $r20 ++#define C2 $r23 ++#define C3 $r24 ++#define T0 $r25 ++#define T1 $r26 ++#define T2 $r27 ++#define KK $r28 ++#define AA $r29 ++#define CC $r30 ++#define BB B0 ++#undef ZERO ++#define ZERO $r0 ++ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++#define U8 $xr8 ++#define U9 $xr9 ++#define U10 $xr10 ++#define U11 $xr11 ++#define U12 $xr12 ++#define U13 $xr13 ++#define U14 $xr14 ++#define U15 $xr15 ++#define D0 $xr16 ++#define D1 $xr17 ++#define D2 $xr18 ++#define D3 $xr19 ++#define D4 $xr20 ++#define D5 $xr21 ++#define D6 $xr22 ++#define D7 $xr23 ++#define D8 $xr24 ++#define D9 $xr25 ++#define D10 $xr26 ++#define D11 $xr27 ++#define D12 $xr28 ++#define D13 $xr29 ++#define D14 $xr30 ++#define D15 $xr31 ++#define G0 D0 ++#define G1 D1 ++#define G2 D2 ++#define G3 D3 ++#define G4 D4 ++#define G5 D5 ++#define G6 D6 ++#define G7 D7 ++#define G8 D8 ++#define G9 D9 ++#define G10 D10 ++#define G11 D11 ++#define G12 D12 ++#define G13 D13 ++#define G14 D14 ++#define G15 D15 ++ ++/* Prefetch interval */ ++#define A_PRE 0x400 ++#define B_PRE 0x100 ++ ++#include "dtrsm_kernel_macro.S" ++ ++.macro ldrepl_macro start, end, stride ++// Load Ux (x = 0...15) ++.if \start <= \end ++ GLDREPL xv, d, $xr\start, B0, \stride * 8 ++ ldrepl_macro %start + 1, \end, %stride + 1 ++.endif ++.endm ++ ++.macro nmsub_macro start0, end0, start1, reg ++// Ux -= reg * Dx ++.if \start0 <= \end0 ++ xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 ++ nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg ++.endif ++.endm ++ ++.macro A_st_macro start, end, stride, N ++// Store Ux(x = 0...15) ++.if \start <= \end ++.if \N == 4 ++ xvst $xr\start, A0, \stride * 0x20 ++.elseif \N == 2 ++ vst $vr\start, A0, 
\stride * 0x10 ++.elseif \N == 1 ++ fst.d $f\start, A0, \stride * 0x08 ++.endif ++ A_st_macro %start + 1, \end, %stride + 1, \N ++.endif ++.endm ++ ++.macro dsolve_16x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 1 2 3 ++// 5 6 7 ++// 10 11 ++// 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 19, 0 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 ++ ldrepl_macro 20, 22, 5 ++ nmsub_macro 4, 7, 0, D1 ++ ldrepl_macro 23, 24, 10 ++ GMUL xvf, d, U4, D4, U4, U5, D4, U5, U6, D4, U6, U7, D4, U7 ++ ldrepl_macro 25, 25, 15 ++ nmsub_macro 8, 11, 0, D2 ++ nmsub_macro 8, 11, 4, D5 ++ GMUL xvf, d, U8, D7, U8, U9, D7, U9, U10, D7, U10, U11, D7, U11 ++ nmsub_macro 12, 15, 0, D3 ++ nmsub_macro 12, 15, 4, D6 ++ nmsub_macro 12, 15, 8, D8 ++ GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 ++// Store A ++ A_st_macro 0, 15, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ ++ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ ++ U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \ ++ U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 ++.endm ++ ++.macro dsolve_16x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 1 ++// 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 17, 0 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 ++ ldrepl_macro 18, 18, 3 ++ nmsub_macro 4, 7, 0, D1 ++ GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 ++// Store A ++ A_st_macro 0, 7, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ ++ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 ++.endm ++ ++.macro dsolve_8x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 1 2 3 ++// 5 6 7 ++// 10 11 ++// 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 19, 0 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1 ++ ldrepl_macro 20, 22, 5 ++ nmsub_macro 2, 3, 0, D1 ++ ldrepl_macro 23, 24, 10 ++ GMUL xvf, d, U2, D4, U2, U3, D4, U3 ++ ldrepl_macro 25, 25, 15 ++ nmsub_macro 4, 5, 0, D2 ++ nmsub_macro 4, 5, 2, D5 ++ GMUL xvf, d, U4, D7, U4, U5, D7, U5 ++ nmsub_macro 6, 7, 0, D3 ++ nmsub_macro 6, 7, 2, D6 ++ nmsub_macro 6, 7, 4, D8 ++ GMUL xvf, d, U6, D9, U6, U7, D9, U7 ++// Store A ++ A_st_macro 0, 7, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ ++ U2, C1, 0x00, U3, C1, 0x20, \ ++ U4, C2, 0x00, U5, C2, 0x20, \ ++ U6, C3, 0x00, U7, C3, 0x20 ++.endm ++ ++.macro dsolve_8x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 1 ++// 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 17, 0 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1 ++ ldrepl_macro 18, 18, 3 ++ nmsub_macro 2, 3, 0, D1 ++ GMUL xvf, d, U2, D2, U2, U3, D2, U3 ++// Store A ++ A_st_macro 0, 3, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ ++ U2, C1, 0x00, U3, C1, 0x20 ++.endm ++ ++.macro dsolve_4x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. 
The memory layout of ++// matrix B is as follows: ++//0 1 2 3 ++// 5 6 7 ++// 10 11 ++// 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 19, 0 ++ GMUL xvf, d, U0, D0, U0 ++ ldrepl_macro 20, 22, 5 ++ nmsub_macro 1, 1, 0, D1 ++ ldrepl_macro 23, 24, 10 ++ GMUL xvf, d, U1, D4, U1 ++ ldrepl_macro 25, 25, 15 ++ nmsub_macro 2, 2, 0, D2 ++ nmsub_macro 2, 2, 1, D5 ++ GMUL xvf, d, U2, D7, U2 ++ nmsub_macro 3, 3, 0, D3 ++ nmsub_macro 3, 3, 1, D6 ++ nmsub_macro 3, 3, 2, D8 ++ GMUL xvf, d, U3, D9, U3 ++// Store A ++ A_st_macro 0, 3, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 ++.endm ++ ++.macro dsolve_4x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 1 ++// 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 17, 0 ++ GMUL xvf, d, U0, D0, U0 ++ ldrepl_macro 18, 18, 3 ++ nmsub_macro 1, 1, 0, D1 ++ GMUL xvf, d, U1, D2, U1 ++// Store A ++ A_st_macro 0, 1, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C1, 0x00 ++.endm ++ ++.macro dsolve_2x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 1 2 3 ++// 5 6 7 ++// 10 11 ++// 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 19, 0 ++ GMUL xvf, d, U0, D0, U0 ++ ldrepl_macro 20, 22, 5 ++ nmsub_macro 1, 1, 0, D1 ++ ldrepl_macro 23, 24, 10 ++ GMUL xvf, d, U1, D4, U1 ++ ++ ldrepl_macro 25, 25, 15 ++ nmsub_macro 2, 2, 0, D2 ++ nmsub_macro 2, 2, 1, D5 ++ GMUL xvf, d, U2, D7, U2 ++ nmsub_macro 3, 3, 0, D3 ++ nmsub_macro 3, 3, 1, D6 ++ nmsub_macro 3, 3, 2, D8 ++ GMUL xvf, d, U3, D9, U3 ++// Store A ++ A_st_macro 0, 3, 0, 2 ++// Store C ++ GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00, ++.endm ++ ++.macro dsolve_2x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 1 ++// 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 17, 0 ++ GMUL xvf, d, U0, D0, U0 ++ ldrepl_macro 18, 18, 3 ++ nmsub_macro 1, 1, 0, D1 ++ GMUL xvf, d, U1, D2, U1 ++// Store A ++ A_st_macro 0, 1, 0, 2 ++// Store C ++ GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00 ++.endm ++ ++.macro dsolve_1x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 1 2 3 ++// 5 6 7 ++// 10 11 ++// 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 19, 0 ++ GMUL xvf, d, U0, D0, U0 ++ ldrepl_macro 20, 22, 5 ++ nmsub_macro 1, 1, 0, D1 ++ ldrepl_macro 23, 24, 10 ++ GMUL xvf, d, U1, D4, U1 ++ ++ ldrepl_macro 25, 25, 15 ++ nmsub_macro 2, 2, 0, D2 ++ nmsub_macro 2, 2, 1, D5 ++ GMUL xvf, d, U2, D7, U2 ++ nmsub_macro 3, 3, 0, D3 ++ nmsub_macro 3, 3, 1, D6 ++ nmsub_macro 3, 3, 2, D8 ++ GMUL xvf, d, U3, D9, U3 ++// Store A ++ A_st_macro 0, 3, 0, 1 ++// Store C ++ GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, ++.endm ++ ++.macro dsolve_1x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. 
The memory layout of ++// matrix B is as follows: ++//0 1 ++// 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 17, 0 ++ GMUL xvf, d, U0, D0, U0 ++ ldrepl_macro 18, 18, 3 ++ nmsub_macro 1, 1, 0, D1 ++ GMUL xvf, d, U1, D2, U1 ++// Store A ++ A_st_macro 0, 1, 0, 1 ++// Store C ++ GST f, d, $f0, C0, 0x00, $f1, C1, 0x00 ++.endm ++ ++.macro dgemm_dsolve_16x4 ++ bge ZERO, L, .L_dsolve_16x4_load ++ dgemm_16x4 ++ b .L_dsolve_16x4 ++.L_dsolve_16x4_load: ++ // Load C ++ GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 ++ GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 ++ GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 ++ GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 ++/********************** solver ******************/ ++.L_dsolve_16x4: ++ dsolve_16x4 ++.endm ++ ++.macro dgemm_dsolve_8x4 ++ bge ZERO, L, .L_dsolve_8x4_load ++ dgemm_8x4 ++ b .L_dsolve_8x4 ++.L_dsolve_8x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++ ++ /* Load C2 */ ++ xvld U4, C2, 0x00 ++ xvld U5, C2, 0x20 ++ ++ /* Load C3 */ ++ xvld U6, C3, 0x00 ++ xvld U7, C3, 0x20 ++/********* solver *********/ ++.L_dsolve_8x4: ++ dsolve_8x4 ++.endm ++ ++.macro dgemm_dsolve_4x4 ++ bge ZERO, L, .L_dsolve_4x4_load ++ dgemm_4x4 ++ b .L_dsolve_4x4 ++.L_dsolve_4x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++/************** solver *****************/ ++.L_dsolve_4x4: ++ dsolve_4x4 ++.endm ++ ++.macro dgemm_dsolve_2x4 ++ bge ZERO, L, .L_dsolve_2x4_load ++ dgemm_2x4 ++ xvpermi.q U2, U0, 0x01 ++ xvpermi.q U3, U1, 0x01 ++ b .L_dsolve_2x4 ++.L_dsolve_2x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++/********************** solver ******************/ ++.L_dsolve_2x4: ++ dsolve_2x4 ++.endm ++ ++.macro dgemm_dsolve_1x4 ++ bge ZERO, L, .L_dsolve_1x4_load ++ dgemm_1x4 ++ xvpackod.d U1, U0, U0 ++ xvpermi.q U2, U0, 0x01 ++ xvpermi.q U3, U1, 0x01 ++ b .L_dsolve_1x4 ++.L_dsolve_1x4_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++ fld.d $f1, C1, 0x00 ++ fld.d $f2, C2, 0x00 ++ fld.d $f3, C3, 0x00 ++.L_dsolve_1x4: ++ dsolve_1x4 ++.endm ++ ++.macro dgemm_dsolve_16x2 ++ bge ZERO, L, .L_dsolve_16x2_load ++ dgemm_16x2 ++ b .L_dsolve_16x2 ++.L_dsolve_16x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++ /* Load C1 */ ++ xvld U4, C1, 0x00 ++ xvld U5, C1, 0x20 ++ xvld U6, C1, 0x40 ++ xvld U7, C1, 0x60 ++.L_dsolve_16x2: ++ dsolve_16x2 ++.endm ++ ++.macro dgemm_dsolve_8x2 ++ bge ZERO, L, .L_dsolve_8x2_load ++ dgemm_8x2 ++ b .L_dsolve_8x2 ++.L_dsolve_8x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++.L_dsolve_8x2: ++ dsolve_8x2 ++.endm ++ ++.macro dgemm_dsolve_4x2 ++ bge ZERO, L, .L_dsolve_4x2_load ++ dgemm_4x2 ++ b .L_dsolve_4x2 ++.L_dsolve_4x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++.L_dsolve_4x2: ++ dsolve_4x2 ++.endm ++ ++.macro dgemm_dsolve_2x2 ++ bge ZERO, L, .L_dsolve_2x2_load ++ dgemm_2x2 ++ b .L_dsolve_2x2 ++.L_dsolve_2x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++.L_dsolve_2x2: ++ dsolve_2x2 ++.endm ++ ++.macro dgemm_dsolve_1x2 ++ bge ZERO, L, 
.L_dsolve_1x2_load ++ dgemm_1x2 ++ xvpackod.d U1, U0, U0 ++ b .L_dsolve_1x2 ++.L_dsolve_1x2_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++ fld.d $f1, C1, 0x00 ++.L_dsolve_1x2: ++ dsolve_1x2 ++.endm ++ ++.macro dgemm_dsolve_16x1 ++ bge ZERO, L, .L_dsolve_16x1_load ++ dgemm_16x1 ++ b .L_dsolve_16x1 ++.L_dsolve_16x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++.L_dsolve_16x1: ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 ++ // Store A ++ A_st_macro 0, 3, 0, 4 ++ // Strore C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 ++.endm ++ ++.macro dgemm_dsolve_8x1 ++ bge ZERO, L, .L_dsolve_8x1_load ++ dgemm_8x1 ++ b .L_dsolve_8x1 ++.L_dsolve_8x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++.L_dsolve_8x1: ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1 ++ // Store A ++ A_st_macro 0, 1, 0, 4 ++ // Strore C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20 ++.endm ++ ++.macro dgemm_dsolve_4x1 ++ bge ZERO, L, .L_dsolve_4x1_load ++ dgemm_4x1 ++ b .L_dsolve_4x1 ++.L_dsolve_4x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++.L_dsolve_4x1: ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0 ++ // Store A ++ A_st_macro 0, 0, 0, 4 ++ // Strore C ++ GST xv, , U0, C0, 0x00 ++.endm ++ ++.macro dgemm_dsolve_2x1 ++ bge ZERO, L, .L_dsolve_2x1_load ++ dgemm_2x1 ++ b .L_dsolve_2x1 ++.L_dsolve_2x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++.L_dsolve_2x1: ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0 ++ // Store A ++ A_st_macro 0, 0, 0, 2 ++ // Strore C ++ GST v, , $vr0, C0, 0x00 ++.endm ++ ++.macro dgemm_dsolve_1x1 ++ bge ZERO, L, .L_dsolve_1x1_load ++ dgemm_1x1 ++ b .L_dsolve_1x1 ++.L_dsolve_1x1_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++.L_dsolve_1x1: ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0 ++ // Store A ++ A_st_macro 0, 0, 0, 1 ++ // Strore C ++ GST f, d, $f0, C0, 0x00 ++.endm ++ ++ PROLOGUE ++ push_if_used 26, 32 ++ PTR_SLLI LDC, LDC, 3 ++ PTR_SUB KK, ZERO, OFFSET ++ /* if (!(N >> 2)) goto L_N3 */ ++ PTR_SRAI J, N, 2 /* J = bn >> 2 */ ++ andi N, N, 0x03 ++ beq ZERO, J, .L_N3 ++.align 5 ++.L_J1: ++ PTR_ADDI J, J, -1 ++ move AA, A ++ move CC, C ++ PTR_SRAI I, M, 4 // M >> 4 ++ beqz I, .L_M15 ++.align 4 ++.L_I1: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_16x4 ++ PTR_ADDI I, I, -1 ++ PTR_SLLI T0, K, 7 ++ PTR_ADDI CC, CC, 0x80 // cc += 16 ++ PTR_ADD AA, AA, T0 // aa += 16 * k ++ bnez I, .L_I1 ++.L_M15: ++ andi I, M, 8 ++ beqz I, .L_M7 ++.L_M8: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_8x4 ++ PTR_SLLI T0, K, 6 ++ PTR_ADDI CC, CC, 0x40 // cc += 8 ++ PTR_ADD AA, AA, T0 // aa += 8 * k ++.L_M7: ++ andi I, M, 4 ++ beqz I, .L_M3 ++.L_M4: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_4x4 ++ PTR_SLLI T0, K, 5 ++ PTR_ADDI CC, CC, 0x20 // cc += 4 ++ PTR_ADD AA, AA, T0 // aa += 4 * k ++.L_M3: ++ andi I, M, 2 ++ beqz I, .L_M1 ++.L_M2: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_2x4 ++ PTR_SLLI T0, K, 4 ++ PTR_ADDI CC, CC, 0x10 // cc += 2 ++ PTR_ADD AA, AA, T0 // aa += 2 * k ++.L_M1: ++ andi I, M, 1 ++ beqz I, .L_M0 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_1x4 
++ PTR_SLLI T0, K, 3 ++ PTR_ADDI CC, CC, 0x08 // cc += 1 ++ PTR_ADD AA, AA, T0 // aa += 1 * k ++.L_M0: ++ PTR_SLLI T0, K, 5 ++ PTR_SLLI T1, LDC, 2 ++ PTR_ADD B, B, T0 // b += 4 * k ++ PTR_ADD C, C, T1 // c += 4 * ldc ++ PTR_ADDI KK, KK, 4 // kk += 4 ++ bnez J, .L_J1 ++.L_N3: ++ andi J, N, 2 ++ beq ZERO, J, .L_N1 ++.L_N2: ++ move AA, A ++ move CC, C ++ PTR_SRAI I, M, 4 // M >> 4 ++ beqz I, .L_N2_M15 ++.align 4 ++.L_N2_I1: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_16x2 ++ PTR_ADDI I, I, -1 ++ PTR_SLLI T0, K, 7 ++ PTR_ADDI CC, CC, 0x80 // cc += 16 ++ PTR_ADD AA, AA, T0 // aa += 16 * k ++ bnez I, .L_N2_I1 ++.L_N2_M15: ++ andi I, M, 8 ++ beqz I, .L_N2_M7 ++.L_N2_M8: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_8x2 ++ PTR_SLLI T0, K, 6 ++ PTR_ADDI CC, CC, 0x40 // cc += 8 ++ PTR_ADD AA, AA, T0 // aa += 8 * k ++.L_N2_M7: ++ andi I, M, 4 ++ beqz I, .L_N2_M3 ++.L_N2_M4: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_4x2 ++ PTR_SLLI T0, K, 5 ++ PTR_ADDI CC, CC, 0x20 // cc += 4 ++ PTR_ADD AA, AA, T0 // aa += 4 * k ++.L_N2_M3: ++ andi I, M, 2 ++ beqz I, .L_N2_M1 ++.L_N2_M2: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_2x2 ++ PTR_SLLI T0, K, 4 ++ PTR_ADDI CC, CC, 0x10 // cc += 2 ++ PTR_ADD AA, AA, T0 // aa += 2 * k ++.L_N2_M1: ++ andi I, M, 1 ++ beqz I, .L_N2_M0 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_1x2 ++ PTR_SLLI T0, K, 3 ++ PTR_ADDI CC, CC, 0x08 // cc += 1 ++ PTR_ADD AA, AA, T0 // aa += 1 * k ++.L_N2_M0: ++ PTR_SLLI T0, K, 4 ++ PTR_SLLI T1, LDC, 1 ++ PTR_ADD B, B, T0 // b += 2 * k ++ PTR_ADD C, C, T1 // c += 2 * ldc ++ PTR_ADDI KK, KK, 2 // kk += 2 ++.L_N1: ++ andi J, N, 1 ++ beq ZERO, J, .L_N0 ++ move AA, A ++ move CC, C ++ PTR_SRAI I, M, 4 // M >> 4 ++ beqz I, .L_N1_M15 ++.align 4 ++.L_N1_I1: ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_16x1 ++ PTR_ADDI I, I, -1 ++ PTR_SLLI T0, K, 7 ++ PTR_ADDI CC, CC, 0x80 // cc += 16 ++ PTR_ADD AA, AA, T0 // aa += 16 * k ++ bnez I, .L_N1_I1 ++.L_N1_M15: ++ andi I, M, 8 ++ beqz I, .L_N1_M7 ++.L_N1_M8: ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_8x1 ++ PTR_SLLI T0, K, 6 ++ PTR_ADDI CC, CC, 0x40 // cc += 8 ++ PTR_ADD AA, AA, T0 // aa += 8 * k ++.L_N1_M7: ++ andi I, M, 4 ++ beqz I, .L_N1_M3 ++.L_N1_M4: ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_4x1 ++ PTR_SLLI T0, K, 5 ++ PTR_ADDI CC, CC, 0x20 // cc += 4 ++ PTR_ADD AA, AA, T0 // aa += 4 * k ++.L_N1_M3: ++ andi I, M, 2 ++ beqz I, .L_N1_M1 ++.L_N1_M2: ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_2x1 ++ PTR_SLLI T0, K, 4 ++ PTR_ADDI CC, CC, 0x10 // cc += 2 ++ PTR_ADD AA, AA, T0 // aa += 2 * k ++.L_N1_M1: ++ andi I, M, 1 ++ beqz I, .L_N1_M0 ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_1x1 ++ PTR_SLLI T0, K, 3 ++ PTR_ADDI CC, CC, 0x08 // cc += 1 ++ PTR_ADD AA, AA, T0 // aa += 1 * k ++.L_N1_M0: ++.L_N0: ++ pop_if_used 26, 32 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S +new file mode 100644 +index 000000000..5f86d75b5 +--- /dev/null ++++ b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S +@@ -0,0 +1,953 @@ 
++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/09/26 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++*********************************************************************/ ++ ++/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, ++ * FLOAT *c, BLASLONG ldc, BLASLONG offset) ++ */ ++#define M $r4 // param 1: bm ++#define N $r5 // param 2: bn ++#define K $r6 // param 3: bk ++#define A $r7 // param 5: ba ++#define B $r8 // param 6: bb ++#define C $r9 // param 7: bc ++#define LDC $r10 // param 8: ldc ++#define OFFSET $r11 // param 9: offset ++ ++/* Cycle control parameters */ ++#define I $r13 ++#define J $r14 ++#define L $r15 ++#define TL $r16 ++/* Matrix address */ ++#define A0 $r17 ++#define B0 $r18 ++#define C0 $r19 ++#define C1 $r20 ++#define C2 $r23 ++#define C3 $r24 ++#define T0 $r25 ++#define T1 $r26 ++#define T2 $r27 ++#define KK $r28 ++#define AA $r29 ++#define CC $r30 ++#define BB $r31 ++#undef ZERO ++#define ZERO $r0 ++ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++#define U8 $xr8 ++#define U9 $xr9 ++#define U10 $xr10 ++#define U11 $xr11 ++#define U12 $xr12 ++#define U13 $xr13 ++#define U14 $xr14 ++#define U15 $xr15 ++#define D0 $xr16 ++#define D1 $xr17 ++#define D2 $xr18 ++#define D3 $xr19 ++#define D4 $xr20 ++#define D5 $xr21 ++#define D6 $xr22 ++#define D7 $xr23 ++#define D8 $xr24 ++#define D9 $xr25 ++#define D10 $xr26 ++#define D11 $xr27 ++#define D12 $xr28 ++#define D13 $xr29 ++#define D14 $xr30 ++#define D15 $xr31 ++ ++/* Prefetch interval */ ++#define A_PRE 0x400 ++#define B_PRE 0x100 ++ ++#include "dtrsm_kernel_macro.S" ++ ++.macro ldrepl_macro start, 
end, stride ++// Load Ux (x = 0...15) ++.if \start <= \end ++ GLDREPL xv, d, $xr\start, B0, \stride * 8 ++ ldrepl_macro %start + 1, \end, %stride + 1 ++.endif ++.endm ++ ++.macro nmsub_macro start0, end0, start1, reg ++// Ux -= reg * Dx ++.if \start0 <= \end0 ++ xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 ++ nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg ++.endif ++.endm ++ ++.macro A_st_macro start, end, stride, N ++// Store Ux(x = 0...15) ++.if \start <= \end ++.if \N == 4 ++ xvst $xr\start, A0, \stride * 0x20 ++.elseif \N == 2 ++ vst $vr\start, A0, \stride * 0x10 ++.elseif \N == 1 ++ fst.d $f\start, A0, \stride * 0x08 ++.endif ++ A_st_macro %start + 1, \end, %stride + 1, \N ++.endif ++.endm ++ ++.macro dsolve_16x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 ++//2 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 16, 0 ++ ldrepl_macro 17, 18, 2 ++ GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 ++ nmsub_macro 0, 3, 4, D1 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 ++// Store A ++ A_st_macro 0, 7, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ ++ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 ++.endm ++ ++.macro dsolve_8x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 ++//2 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 16, 0 ++ ldrepl_macro 17, 18, 2 ++ GMUL xvf, d, U2, D2, U2, U3, D2, U3 ++ nmsub_macro 0, 1, 2, D1 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1 ++// Store A ++ A_st_macro 0, 3, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ ++ U2, C1, 0x00, U3, C1, 0x20 ++.endm ++ ++.macro dsolve_4x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 ++//2 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 16, 0 ++ ldrepl_macro 17, 18, 2 ++ GMUL xvf, d, U1, D2, U1 ++ nmsub_macro 0, 0, 1, D1 ++ GMUL xvf, d, U0, D0, U0 ++// Store A ++ A_st_macro 0, 1, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C1, 0x00 ++.endm ++ ++.macro dsolve_2x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 ++//2 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 16, 0 ++ ldrepl_macro 17, 18, 2 ++ GMUL xvf, d, U1, D2, U1 ++ nmsub_macro 0, 0, 1, D1 ++ GMUL xvf, d, U0, D0, U0 ++// Store A ++ A_st_macro 0, 1, 0, 2 ++// Store C ++ GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00 ++.endm ++ ++.macro dsolve_1x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 ++//2 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 16, 0 ++ ldrepl_macro 17, 18, 2 ++ GMUL xvf, d, U1, D2, U1 ++ nmsub_macro 0, 0, 1, D1 ++ GMUL xvf, d, U0, D0, U0 ++// Store A ++ A_st_macro 0, 1, 0, 1 ++// Store C ++ GST f, d, $f0, C0, 0x00, $f1, C1, 0x00 ++.endm ++ ++.macro dsolve_16x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. 
The memory layout of ++// matrix B is as follows: ++//0 ++//4 5 ++//8 9 10 ++//12 13 14 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 22, 25, 12 ++ GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 ++ ldrepl_macro 19, 21, 8 ++ nmsub_macro 8, 11, 12, D8 ++ ldrepl_macro 17, 18, 4 ++ GMUL xvf, d, U8, D5, U8, U9, D5, U9, U10, D5, U10, U11, D5, U11 ++ ldrepl_macro 16, 16, 0 ++ nmsub_macro 4, 7, 12, D7 ++ nmsub_macro 4, 7, 8, D4 ++ GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 ++ nmsub_macro 0, 3, 12, D6 ++ nmsub_macro 0, 3, 8, D3 ++ nmsub_macro 0, 3, 4, D1 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 ++// Store A ++ A_st_macro 0, 15, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ ++ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ ++ U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \ ++ U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 ++.endm ++ ++.macro dsolve_8x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 ++//4 5 ++//8 9 10 ++//12 13 14 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 22, 25, 12 ++ GMUL xvf, d, U6, D9, U6, U7, D9, U7 ++ ldrepl_macro 19, 21, 8 ++ nmsub_macro 4, 5, 6, D8 ++ ldrepl_macro 17, 18, 4 ++ GMUL xvf, d, U4, D5, U4, U5, D5, U5 ++ ldrepl_macro 16, 16, 0 ++ nmsub_macro 2, 3, 6, D7 ++ nmsub_macro 2, 3, 4, D4 ++ GMUL xvf, d, U2, D2, U2, U3, D2, U3 ++ nmsub_macro 0, 1, 6, D6 ++ nmsub_macro 0, 1, 4, D3 ++ nmsub_macro 0, 1, 2, D1 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1 ++// Store A ++ A_st_macro 0, 7, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ ++ U2, C1, 0x00, U3, C1, 0x20, \ ++ U4, C2, 0x00, U5, C2, 0x20, \ ++ U6, C3, 0x00, U7, C3, 0x20 ++.endm ++ ++.macro dsolve_4x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 ++//4 5 ++//8 9 10 ++//12 13 14 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 22, 25, 12 ++ GMUL xvf, d, U3, D9, U3 ++ ldrepl_macro 19, 21, 8 ++ nmsub_macro 2, 2, 3, D8 ++ ldrepl_macro 17, 18, 4 ++ GMUL xvf, d, U2, D5, U2 ++ ldrepl_macro 16, 16, 0 ++ nmsub_macro 1, 1, 3, D7 ++ nmsub_macro 1, 1, 2, D4 ++ GMUL xvf, d, U1, D2, U1 ++ nmsub_macro 0, 0, 3, D6 ++ nmsub_macro 0, 0, 2, D3 ++ nmsub_macro 0, 0, 1, D1 ++ GMUL xvf, d, U0, D0, U0 ++// Store A ++ A_st_macro 0, 3, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 ++.endm ++ ++.macro dsolve_2x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. 
The memory layout of ++// matrix B is as follows: ++//0 ++//4 5 ++//8 9 10 ++//12 13 14 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 22, 25, 12 ++ GMUL xvf, d, U3, D9, U3 ++ ldrepl_macro 19, 21, 8 ++ nmsub_macro 2, 2, 3, D8 ++ ldrepl_macro 17, 18, 4 ++ GMUL xvf, d, U2, D5, U2 ++ ldrepl_macro 16, 16, 0 ++ nmsub_macro 1, 1, 3, D7 ++ nmsub_macro 1, 1, 2, D4 ++ GMUL xvf, d, U1, D2, U1 ++ nmsub_macro 0, 0, 3, D6 ++ nmsub_macro 0, 0, 2, D3 ++ nmsub_macro 0, 0, 1, D1 ++ GMUL xvf, d, U0, D0, U0 ++// Store A ++ A_st_macro 0, 3, 0, 2 ++// Store C ++ GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00 ++.endm ++ ++.macro dsolve_1x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 ++//4 5 ++//8 9 10 ++//12 13 14 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 22, 25, 12 ++ GMUL xvf, d, U3, D9, U3 ++ ldrepl_macro 19, 21, 8 ++ nmsub_macro 2, 2, 3, D8 ++ ldrepl_macro 17, 18, 4 ++ GMUL xvf, d, U2, D5, U2 ++ ldrepl_macro 16, 16, 0 ++ nmsub_macro 1, 1, 3, D7 ++ nmsub_macro 1, 1, 2, D4 ++ GMUL xvf, d, U1, D2, U1 ++ nmsub_macro 0, 0, 3, D6 ++ nmsub_macro 0, 0, 2, D3 ++ nmsub_macro 0, 0, 1, D1 ++ GMUL xvf, d, U0, D0, U0 ++// Store A ++ A_st_macro 0, 3, 0, 1 ++// Store C ++ GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, ++.endm ++ ++.macro dgemm_dsolve_16x1 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_16x1_load ++ dgemm_16x1 ++ b .L_dsolve_16x1 ++.L_dsolve_16x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++.L_dsolve_16x1: ++ PTR_ADDI A0, T1, -16 * 8 ++ PTR_ADDI B0, T2, -1 * 8 ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 ++ // Store A ++ A_st_macro 0, 3, 0, 4 ++ // Strore C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 ++.endm ++ ++.macro dgemm_dsolve_8x1 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_8x1_load ++ dgemm_8x1 ++ b .L_dsolve_8x1 ++.L_dsolve_8x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++.L_dsolve_8x1: ++ PTR_ADDI A0, T1, -8 * 8 ++ PTR_ADDI B0, T2, -1 * 8 ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1 ++ // Store A ++ A_st_macro 0, 1, 0, 4 ++ // Strore C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20 ++.endm ++ ++.macro dgemm_dsolve_4x1 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_4x1_load ++ dgemm_4x1 ++ b .L_dsolve_4x1 ++.L_dsolve_4x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++.L_dsolve_4x1: ++ PTR_ADDI A0, T1, -4 * 8 ++ PTR_ADDI B0, T2, -1 * 8 ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0 ++ // Store A ++ A_st_macro 0, 0, 0, 4 ++ // Strore C ++ GST xv, , U0, C0, 0x00 ++.endm ++ ++.macro dgemm_dsolve_2x1 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_2x1_load ++ dgemm_2x1 ++ b .L_dsolve_2x1 ++.L_dsolve_2x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++.L_dsolve_2x1: ++ PTR_ADDI A0, T1, -2 * 8 ++ PTR_ADDI B0, T2, -1 * 8 ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0 ++ // Store A ++ A_st_macro 0, 0, 0, 2 ++ // Strore C ++ GST v, , $vr0, C0, 0x00 ++.endm ++ ++.macro dgemm_dsolve_1x1 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_1x1_load ++ dgemm_1x1 ++ b .L_dsolve_1x1 ++.L_dsolve_1x1_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++.L_dsolve_1x1: ++ PTR_ADDI A0, T1, -1 * 8 ++ PTR_ADDI B0, T2, -1 * 8 ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0 ++ // Store 
A ++ A_st_macro 0, 0, 0, 1 ++ // Strore C ++ GST f, d, $f0, C0, 0x00 ++.endm ++ ++.macro dgemm_dsolve_16x2 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_16x2_load ++ dgemm_16x2 ++ b .L_dsolve_16x2 ++.L_dsolve_16x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++ /* Load C1 */ ++ xvld U4, C1, 0x00 ++ xvld U5, C1, 0x20 ++ xvld U6, C1, 0x40 ++ xvld U7, C1, 0x60 ++.L_dsolve_16x2: ++ PTR_ADDI A0, T1, -(16 * 2) * 8 ++ PTR_ADDI B0, T2, -(2 * 2) * 8 ++ dsolve_16x2 ++.endm ++ ++.macro dgemm_dsolve_8x2 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_8x2_load ++ dgemm_8x2 ++ b .L_dsolve_8x2 ++.L_dsolve_8x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++.L_dsolve_8x2: ++ PTR_ADDI A0, T1, -(8 * 2) * 8 ++ PTR_ADDI B0, T2, -(2 * 2) * 8 ++ dsolve_8x2 ++.endm ++ ++.macro dgemm_dsolve_4x2 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_4x2_load ++ dgemm_4x2 ++ b .L_dsolve_4x2 ++.L_dsolve_4x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++.L_dsolve_4x2: ++ PTR_ADDI A0, T1, -(4 * 2) * 8 ++ PTR_ADDI B0, T2, -(2 * 2) * 8 ++ dsolve_4x2 ++.endm ++ ++.macro dgemm_dsolve_2x2 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_2x2_load ++ dgemm_2x2 ++ b .L_dsolve_2x2 ++.L_dsolve_2x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++.L_dsolve_2x2: ++ PTR_ADDI A0, T1, -(2 * 2) * 8 ++ PTR_ADDI B0, T2, -(2 * 2) * 8 ++ dsolve_2x2 ++.endm ++ ++.macro dgemm_dsolve_1x2 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_1x2_load ++ dgemm_1x2 ++ xvpackod.d U1, U0, U0 ++ b .L_dsolve_1x2 ++.L_dsolve_1x2_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++ fld.d $f1, C1, 0x00 ++.L_dsolve_1x2: ++ PTR_ADDI A0, T1, -(1 * 2) * 8 ++ PTR_ADDI B0, T2, -(2 * 2) * 8 ++ dsolve_1x2 ++.endm ++ ++.macro dgemm_dsolve_16x4 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_16x4_load ++ dgemm_16x4 ++ b .L_dsolve_16x4 ++.L_dsolve_16x4_load: ++ // Load C ++ GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 ++ GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 ++ GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 ++ GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 ++/********************** solver ******************/ ++.L_dsolve_16x4: ++ PTR_ADDI A0, T1, -(16 * 4) * 8 ++ PTR_ADDI B0, T2, -(4 * 4) * 8 ++ dsolve_16x4 ++.endm ++ ++.macro dgemm_dsolve_8x4 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_8x4_load ++ dgemm_8x4 ++ b .L_dsolve_8x4 ++.L_dsolve_8x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++ ++ /* Load C2 */ ++ xvld U4, C2, 0x00 ++ xvld U5, C2, 0x20 ++ ++ /* Load C3 */ ++ xvld U6, C3, 0x00 ++ xvld U7, C3, 0x20 ++/********* solver *********/ ++.L_dsolve_8x4: ++ PTR_ADDI A0, T1, -(8 * 4) * 8 ++ PTR_ADDI B0, T2, -(4 * 4) * 8 ++ dsolve_8x4 ++.endm ++ ++.macro dgemm_dsolve_4x4 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_4x4_load ++ dgemm_4x4 ++ b .L_dsolve_4x4 ++.L_dsolve_4x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++/************** solver *****************/ ++.L_dsolve_4x4: ++ PTR_ADDI A0, T1, -(4 * 4) * 8 ++ PTR_ADDI B0, T2, -(4 * 4) * 8 ++ dsolve_4x4 ++.endm ++ ++.macro dgemm_dsolve_2x4 ++ 
or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_2x4_load ++ dgemm_2x4 ++ xvpermi.q U2, U0, 0x01 ++ xvpermi.q U3, U1, 0x01 ++ b .L_dsolve_2x4 ++.L_dsolve_2x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++/********************** solver ******************/ ++.L_dsolve_2x4: ++ PTR_ADDI A0, T1, -(2 * 4) * 8 ++ PTR_ADDI B0, T2, -(4 * 4) * 8 ++ dsolve_2x4 ++.endm ++ ++.macro dgemm_dsolve_1x4 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_1x4_load ++ dgemm_1x4 ++ xvpackod.d U1, U0, U0 ++ xvpermi.q U2, U0, 0x01 ++ xvpermi.q U3, U1, 0x01 ++ b .L_dsolve_1x4 ++.L_dsolve_1x4_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++ fld.d $f1, C1, 0x00 ++ fld.d $f2, C2, 0x00 ++ fld.d $f3, C3, 0x00 ++.L_dsolve_1x4: ++ PTR_ADDI A0, T1, -(1 * 4) * 8 ++ PTR_ADDI B0, T2, -(4 * 4) * 8 ++ dsolve_1x4 ++.endm ++ ++ PROLOGUE ++ push_if_used 26, 32 ++ PTR_SLLI LDC, LDC, 3 ++ PTR_SUB KK, N, OFFSET ++ PTR_MUL T0, N, LDC ++ PTR_MUL T1, N, K ++ PTR_ADD C, C, T0 // c += n * ldc ++ PTR_SLLI T1, T1, 3 ++ PTR_ADD B, B, T1 ++ ++ andi J, N, 1 ++ beqz J, .L_N2 ++.L_N1: ++ move AA, A ++ PTR_SUB C, C, LDC // c -= ldc ++ PTR_SLLI T0, K, 3 ++ PTR_SLLI T1, KK, 3 ++ PTR_SUB B, B, T0 // b -= k ++ PTR_ADD BB, B, T1 // bb = b + kk ++ move CC, C ++ ++ PTR_SRAI I, M, 4 // M >> 4 ++ beqz I, .L_N1_M15 ++.align 4 ++.L_N1_I1: ++ PTR_SLLI T1, KK, 7 ++ GADD , d, C0, CC, ZERO ++ PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_16x1 ++ PTR_ADDI I, I, -1 ++ PTR_SLLI T0, K, 7 ++ PTR_ADDI CC, CC, 0x80 // cc += 16 ++ PTR_ADD AA, AA, T0 // aa += 16 * k ++ bnez I, .L_N1_I1 ++.L_N1_M15: ++ andi I, M, 8 ++ beqz I, .L_N1_M7 ++.L_N1_M8: ++ PTR_SLLI T1, KK, 6 ++ GADD , d, C0, CC, ZERO ++ PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_8x1 ++ PTR_SLLI T0, K, 6 ++ PTR_ADDI CC, CC, 0x40 // cc += 8 ++ PTR_ADD AA, AA, T0 // aa += 8 * k ++.L_N1_M7: ++ andi I, M, 4 ++ beqz I, .L_N1_M3 ++.L_N1_M4: ++ PTR_SLLI T1, KK, 5 ++ GADD , d, C0, CC, ZERO ++ PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_4x1 ++ PTR_SLLI T0, K, 5 ++ PTR_ADDI CC, CC, 0x20 // cc += 4 ++ PTR_ADD AA, AA, T0 // aa += 4 * k ++.L_N1_M3: ++ andi I, M, 2 ++ beqz I, .L_N1_M1 ++.L_N1_M2: ++ PTR_SLLI T1, KK, 4 ++ GADD , d, C0, CC, ZERO ++ PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_2x1 ++ PTR_SLLI T0, K, 4 ++ PTR_ADDI CC, CC, 0x10 // cc += 2 ++ PTR_ADD AA, AA, T0 // aa += 2 * k ++.L_N1_M1: ++ andi I, M, 1 ++ beqz I, .L_N1_M0 ++ PTR_SLLI T1, KK, 3 ++ GADD , d, C0, CC, ZERO ++ PTR_ADD A0, AA, T1 // a0 = aa + kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_1x1 ++ PTR_SLLI T0, K, 3 ++ PTR_ADDI CC, CC, 0x08 // cc += 1 ++ PTR_ADD AA, AA, T0 // aa += 1 * k ++.L_N1_M0: ++ PTR_ADDI KK, KK, -1 ++.L_N2: ++ andi J, N, 2 ++ beq ZERO, J, .L_N4 ++ move AA, A ++ PTR_SLLI T0, LDC, 1 ++ PTR_SLLI T1, K, 4 ++ PTR_SLLI T2, KK, 4 ++ PTR_SUB B, B, T1 ++ PTR_SUB C, C, T0 ++ PTR_ADD BB, B, T2 ++ move CC, C ++ PTR_SRAI I, M, 4 // M >> 4 ++ beqz I, .L_N2_M15 ++.align 4 ++.L_N2_I1: ++ PTR_SLLI T1, KK, 7 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_16x2 ++ PTR_ADDI I, I, -1 ++ PTR_SLLI T0, K, 7 ++ PTR_ADDI CC, CC, 0x80 // cc += 16 ++ PTR_ADD AA, AA, T0 // aa += 16 * 
k ++ bnez I, .L_N2_I1 ++.L_N2_M15: ++ andi I, M, 8 ++ beqz I, .L_N2_M7 ++.L_N2_M8: ++ PTR_SLLI T1, KK, 6 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_8x2 ++ PTR_SLLI T0, K, 6 ++ PTR_ADDI CC, CC, 0x40 // cc += 8 ++ PTR_ADD AA, AA, T0 // aa += 8 * k ++.L_N2_M7: ++ andi I, M, 4 ++ beqz I, .L_N2_M3 ++.L_N2_M4: ++ PTR_SLLI T1, KK, 5 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_4x2 ++ PTR_SLLI T0, K, 5 ++ PTR_ADDI CC, CC, 0x20 // cc += 4 ++ PTR_ADD AA, AA, T0 // aa += 4 * k ++.L_N2_M3: ++ andi I, M, 2 ++ beqz I, .L_N2_M1 ++.L_N2_M2: ++ PTR_SLLI T1, KK, 4 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_2x2 ++ PTR_SLLI T0, K, 4 ++ PTR_ADDI CC, CC, 0x10 // cc += 2 ++ PTR_ADD AA, AA, T0 // aa += 2 * k ++.L_N2_M1: ++ andi I, M, 1 ++ beqz I, .L_N2_M0 ++ PTR_SLLI T1, KK, 3 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_1x2 ++ PTR_SLLI T0, K, 3 ++ PTR_ADDI CC, CC, 0x08 // cc += 1 ++ PTR_ADD AA, AA, T0 // aa += 1 * k ++.L_N2_M0: ++ PTR_ADDI KK, KK, -2 ++.L_N4: ++ PTR_SRAI J, N, 2 /* J = bn >> 2 */ ++ beq ZERO, J, .L_N0 ++.align 5 ++.L_J1: ++ PTR_ADDI J, J, -1 ++ move AA, A ++ PTR_SLLI T0, LDC, 2 ++ PTR_SLLI T1, K, 5 ++ PTR_SLLI T2, KK, 5 ++ PTR_SUB B, B, T1 ++ PTR_SUB C, C, T0 ++ PTR_ADD BB, B, T2 ++ move CC, C ++ PTR_SRAI I, M, 4 // M >> 4 ++ beqz I, .L_M15 ++.align 4 ++.L_I1: ++ PTR_SLLI T1, KK, 7 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_16x4 ++ PTR_ADDI I, I, -1 ++ PTR_SLLI T0, K, 7 ++ PTR_ADDI CC, CC, 0x80 // cc += 16 ++ PTR_ADD AA, AA, T0 // aa += 16 * k ++ bnez I, .L_I1 ++.L_M15: ++ andi I, M, 8 ++ beqz I, .L_M7 ++.L_M8: ++ PTR_SLLI T1, KK, 6 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_8x4 ++ PTR_SLLI T0, K, 6 ++ PTR_ADDI CC, CC, 0x40 // cc += 8 ++ PTR_ADD AA, AA, T0 // aa += 8 * k ++.L_M7: ++ andi I, M, 4 ++ beqz I, .L_M3 ++.L_M4: ++ PTR_SLLI T1, KK, 5 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_4x4 ++ PTR_SLLI T0, K, 5 ++ PTR_ADDI CC, CC, 0x20 // cc += 4 ++ PTR_ADD AA, AA, T0 // aa += 4 * k ++.L_M3: ++ andi I, M, 2 ++ beqz I, .L_M1 ++.L_M2: ++ PTR_SLLI T1, KK, 4 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_2x4 ++ PTR_SLLI T0, K, 4 ++ PTR_ADDI CC, CC, 0x10 // cc += 2 ++ PTR_ADD AA, AA, T0 // aa += 2 * k ++.L_M1: ++ andi I, M, 1 ++ beqz I, .L_M0 ++ PTR_SLLI T1, KK, 3 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_1x4 ++ PTR_SLLI T0, K, 3 ++ PTR_ADDI CC, CC, 0x08 // cc += 1 ++ PTR_ADD AA, AA, T0 // aa += 1 * k ++.L_M0: ++ PTR_ADDI KK, KK, -4 ++ bnez J, .L_J1 ++.L_N0: ++ pop_if_used 26, 32 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/dtrsm_kernel_macro.S 
b/kernel/loongarch64/dtrsm_kernel_macro.S +new file mode 100644 +index 000000000..88b7121d1 +--- /dev/null ++++ b/kernel/loongarch64/dtrsm_kernel_macro.S +@@ -0,0 +1,2147 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*******************************************************************************/ ++ ++/************** Dgemm Kernel 16x4 ****************/ ++.macro KERNEL2x16x4 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ ++ xvld U2, A0, 0x40 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvld U3, A0, 0x60 ++ xvfmadd.d D6, U10, U13, D6 ++ xvfmadd.d D7, U11, U13, D7 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D8, U8, U14, D8 ++ xvfmadd.d D9, U9, U14, D9 ++ ++ preld 0, B0, B_PRE ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D10, U10, U14, D10 ++ xvfmadd.d D11, U11, U14, D11 ++ ++ preld 0, A0, A_PRE ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D12, U8, U15, D12 ++ xvfmadd.d D13, U9, U15, D13 ++ ++ preld 0, A0, A_PRE + 0x40 ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D14, U10, U15, D14 ++ xvfmadd.d D15, U11, U15, D15 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x20 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvld U9, A0, 0x20 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvld U10, A0, 0x40 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvld U11, A0, 0x60 ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 ++ ++ preld 0, B0, B_PRE ++ xvldrepl.d U13, B0, 0x08 ++ xvfmadd.d D10, U2, U6, D10 ++ xvfmadd.d D11, U3, U6, D11 ++ ++ preld 0, A0, A_PRE ++ xvldrepl.d U14, B0, 0x10 ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 ++ ++ preld 0, A0, A_PRE + 0x40 ++ xvldrepl.d U15, B0, 0x18 ++ xvfmadd.d D14, U2, U7, D14 ++ xvfmadd.d D15, U3, U7, D15 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x20 ++.endm ++ ++.macro KERNEL2x16x4_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ ++ xvld U2, A0, 0x40 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvld U3, A0, 0x60 ++ xvfmadd.d D6, U10, U13, D6 ++ xvfmadd.d D7, U11, U13, D7 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D8, U8, U14, D8 ++ xvfmadd.d D9, U9, U14, D9 ++ ++ preld 0, B0, B_PRE ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D10, U10, U14, D10 ++ xvfmadd.d D11, U11, U14, D11 ++ ++ preld 0, A0, A_PRE ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D12, U8, U15, D12 ++ xvfmadd.d D13, U9, U15, D13 ++ ++ preld 0, A0, A_PRE + 0x40 ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D14, U10, U15, D14 ++ xvfmadd.d D15, U11, U15, D15 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x20 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 ++ ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 ++ ++ preld 0, B0, B_PRE ++ xvfmadd.d D10, U2, U6, D10 ++ xvfmadd.d D11, U3, U6, D11 ++ ++ preld 0, A0, A_PRE ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 ++ ++ preld 0, A0, A_PRE + 0x40 ++ xvfmadd.d D14, U2, U7, D14 ++ xvfmadd.d D15, U3, U7, D15 ++.endm ++ ++.macro KERNEL8x16x4 ++.rept 4 ++ KERNEL2x16x4 ++.endr ++.endm ++ ++.macro KERNEL8x16x4_END ++.rept 3 ++ KERNEL2x16x4 ++.endr ++ KERNEL2x16x4_END ++.endm ++ ++.macro KERNEL2x8x4 ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvldrepl.d U5, B0, 0x08 ++ 
xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U8, U14, D8 ++ xvfmadd.d D9, U9, U14, D9 ++ ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U8, U15, D12 ++ xvfmadd.d D13, U9, U15, D13 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x20 ++ ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U13, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvldrepl.d U14, B0, 0x10 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 ++ ++ xvldrepl.d U15, B0, 0x18 ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x20 ++.endm ++ ++.macro KERNEL2x8x4_END ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U8, U14, D8 ++ xvfmadd.d D9, U9, U14, D9 ++ ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U8, U15, D12 ++ xvfmadd.d D13, U9, U15, D13 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x20 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 ++ ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 ++.endm ++ ++.macro KERNEL8x8x4 ++.rept 4 ++ KERNEL2x8x4 ++.endr ++.endm ++ ++.macro KERNEL8x8x4_END ++.rept 3 ++ KERNEL2x8x4 ++.endr ++ KERNEL2x8x4_END ++.endm ++ ++.macro KERNEL2x4x4 ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U8, U13, D4 ++ ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U8, U14, D8 ++ ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U8, U15, D12 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 ++ ++ xvld U8, A0, 0x00 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U13, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ ++ xvldrepl.d U14, B0, 0x10 ++ xvfmadd.d D8, U0, U6, D8 ++ ++ xvldrepl.d U15, B0, 0x18 ++ xvfmadd.d D12, U0, U7, D12 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 ++.endm ++ ++.macro KERNEL2x4x4_END ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U8, U13, D4 ++ ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U8, U14, D8 ++ ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U8, U15, D12 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D12, U0, U7, D12 ++.endm ++ ++.macro KERNEL8x4x4 ++.rept 4 ++ KERNEL2x4x4 ++.endr ++.endm ++ ++.macro KERNEL8x4x4_END ++.rept 3 ++ KERNEL2x4x4 ++.endr ++ KERNEL2x4x4_END ++.endm ++ ++.macro KERNEL2x2x4 ++ xvldrepl.d U0, A0, 0x00 ++ xvldrepl.d U1, A0, 0x08 ++ ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U4, B0, 0x00 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++ ++ xvldrepl.d U8, A0, 0x00 ++ xvldrepl.d U9, A0, 0x08 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvld U12, B0, 0x00 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++.endm ++ ++.macro KERNEL2x2x4_END ++ xvldrepl.d U0, A0, 0x00 ++ xvldrepl.d U1, A0, 0x08 ++ ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U4, B0, 0x00 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 
++.endm ++ ++.macro KERNEL8x2x4 ++.rept 4 ++ KERNEL2x2x4 ++.endr ++.endm ++ ++.macro KERNEL8x2x4_END ++.rept 3 ++ KERNEL2x2x4 ++.endr ++ KERNEL2x2x4_END ++.endm ++ ++.macro KERNEL2x1x4 ++ xvldrepl.d U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvld U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++ ++ xvldrepl.d U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvld U12, B0, 0x00 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++.endm ++ ++.macro KERNEL2x1x4_END ++ xvldrepl.d U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvld U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++ ++ xvfmadd.d D0, U0, U4, D0 ++.endm ++ ++.macro KERNEL8x1x4 ++.rept 4 ++ KERNEL2x1x4 ++.endr ++.endm ++ ++.macro KERNEL8x1x4_END ++.rept 3 ++ KERNEL2x1x4 ++.endr ++ KERNEL2x1x4_END ++.endm ++ ++.macro KERNEL2x16x2 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ ++ xvld U2, A0, 0x40 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvld U3, A0, 0x60 ++ xvfmadd.d D6, U10, U13, D6 ++ xvfmadd.d D7, U11, U13, D7 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvld U9, A0, 0x20 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvld U10, A0, 0x40 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvld U11, A0, 0x60 ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++.endm ++ ++.macro KERNEL2x16x2_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ ++ xvld U2, A0, 0x40 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvld U3, A0, 0x60 ++ xvfmadd.d D6, U10, U13, D6 ++ xvfmadd.d D7, U11, U13, D7 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 ++.endm ++ ++.macro KERNEL8x16x2 ++.rept 4 ++ KERNEL2x16x2 ++.endr ++.endm ++ ++.macro KERNEL8x16x2_END ++.rept 3 ++ KERNEL2x16x2 ++.endr ++ KERNEL2x16x2_END ++.endm ++ ++.macro KERNEL2x8x2 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U1, A0, 0x20 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvld U9, A0, 0x20 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++.endm ++ ++.macro KERNEL2x8x2_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U1, A0, 0x20 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++.endm ++ 
++.macro KERNEL8x8x2 ++.rept 4 ++ KERNEL2x8x2 ++.endr ++.endm ++ ++.macro KERNEL8x8x2_END ++.rept 3 ++ KERNEL2x8x2 ++ .endr ++ KERNEL2x8x2_END ++.endm ++ ++.macro KERNEL2x4x2 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++.endm ++ ++.macro KERNEL2x4x2_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++.endm ++ ++.macro KERNEL8x4x2 ++.rept 4 ++ KERNEL2x4x2 ++.endr ++.endm ++ ++.macro KERNEL8x4x2_END ++.rept 3 ++ KERNEL2x4x2 ++.endr ++ KERNEL2x4x2_END ++.endm ++ ++.macro KERNEL2x2x2 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++.endm ++ ++.macro KERNEL2x2x2_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++.endm ++ ++.macro KERNEL8x2x2 ++.rept 4 ++ KERNEL2x2x2 ++.endr ++.endm ++ ++.macro KERNEL8x2x2_END ++.rept 3 ++ KERNEL2x2x2 ++.endr ++ KERNEL2x2x2_END ++.endm ++ ++.macro KERNEL2x1x2 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++.endm ++ ++.macro KERNEL2x1x2_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++.endm ++ ++.macro KERNEL8x1x2 ++.rept 4 ++ KERNEL2x1x2 ++.endr ++.endm ++ ++.macro KERNEL8x1x2_END ++.rept 3 ++ KERNEL2x1x2 ++.endr ++ KERNEL2x1x2_END ++.endm ++ ++.macro KERNEL2x16x1 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvld U9, A0, 0x20 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvld U10, A0, 0x40 ++ xvld U11, A0, 0x60 ++ ++ xvldrepl.d U12, B0, 0x00 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++.endm ++ ++.macro KERNEL2x16x1_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d 
A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++.endm ++ ++.macro KERNEL8x16x1 ++.rept 4 ++ KERNEL2x16x1 ++.endr ++.endm ++ ++.macro KERNEL8x16x1_END ++.rept 3 ++ KERNEL2x16x1 ++.endr ++ KERNEL2x16x1_END ++.endm ++ ++.macro KERNEL2x8x1 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ xvld U1, A0, 0x20 ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvld U9, A0, 0x20 ++ xvldrepl.d U12, B0, 0x00 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++.endm ++ ++.macro KERNEL2x8x1_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ xvld U1, A0, 0x20 ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++.endm ++ ++.macro KERNEL8x8x1 ++.rept 4 ++ KERNEL2x8x1 ++.endr ++.endm ++ ++.macro KERNEL8x8x1_END ++.rept 3 ++ KERNEL2x8x1 ++.endr ++ KERNEL2x8x1_END ++.endm ++ ++.macro KERNEL2x4x1 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvldrepl.d U12, B0, 0x00 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 ++.endm ++ ++.macro KERNEL2x4x1_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 ++ ++ xvfmadd.d D0, U0, U4, D0 ++.endm ++ ++.macro KERNEL8x4x1 ++.rept 4 ++ KERNEL2x4x1 ++.endr ++.endm ++ ++.macro KERNEL8x4x1_END ++.rept 3 ++ KERNEL2x4x1 ++.endr ++ KERNEL2x4x1_END ++.endm ++ ++.macro KERNEL2x2x1 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvldrepl.d U12, B0, 0x00 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 ++.endm ++ ++.macro KERNEL2x2x1_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 ++ ++ xvfmadd.d D0, U0, U4, D0 ++.endm ++ ++.macro KERNEL8x2x1 ++.rept 4 ++ KERNEL2x2x1 ++.endr ++.endm ++ ++.macro KERNEL8x2x1_END ++.rept 3 ++ KERNEL2x2x1 ++.endr ++ KERNEL2x2x1_END ++.endm ++ ++.macro KERNEL2x1x1 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvldrepl.d U12, B0, 0x00 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++.endm ++ ++.macro KERNEL2x1x1_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ ++ xvfmadd.d D0, U0, U4, D0 ++.endm ++ ++.macro KERNEL8x1x1 ++.rept 4 ++ KERNEL2x1x1 ++.endr ++.endm ++ ++.macro KERNEL8x1x1_END ++.rept 3 ++ KERNEL2x1x1 ++.endr ++ KERNEL2x1x1_END ++.endm ++ ++.macro dgemm_16x4 ++.L_dgemm_16x4: // See dgemm_kernel_16x4.S ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ xvfmul.d D2, U2, U4 ++ xvfmul.d D3, U3, U4 ++ ++ xvldrepl.d U5, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U5 ++ xvfmul.d D5, U1, U5 ++ xvfmul.d D6, U2, U5 ++ xvfmul.d D7, U3, U5 ++ ++ xvldrepl.d U6, B0, 0x10 ++ /* line 3 */ ++ xvfmul.d D8, U0, U6 ++ xvfmul.d D9, U1, U6 ++ 
xvfmul.d D10, U2, U6 ++ xvfmul.d D11, U3, U6 ++ ++ xvldrepl.d U7, B0, 0x18 ++ /* line 4 */ ++ xvfmul.d D12, U0, U7 ++ xvfmul.d D13, U1, U7 ++ xvfmul.d D14, U2, U7 ++ xvfmul.d D15, U3, U7 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x80 ++ PTR_ADDI B0, B0, 0x20 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_L7 */ ++ beq ZERO,TL, .L_dgemm_16x4_L7 ++ ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ xvld U10, A0, 0x40 ++ xvld U11, A0, 0x60 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ xvldrepl.d U14, B0, 0x10 ++ xvldrepl.d U15, B0, 0x18 ++ PTR_ADDI A0, A0, 0x80 ++ PTR_ADDI B0, B0, 0x20 ++ ++ beq ZERO, TL, .L_dgemm_16x4_TL1_END ++.align 5 ++.L_dgemm_16x4_TL1: ++ KERNEL8x16x4 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO, TL, .L_dgemm_16x4_TL1 ++.L_dgemm_16x4_TL1_END: ++ KERNEL8x16x4_END ++.L_dgemm_16x4_L7: ++ andi TL, L, 7 ++ beq TL, ZERO, .L_dgemm_16x4_L0 ++.align 5 ++.L_dgemm_16x4_L71: ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 ++ ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 ++ xvfmadd.d D10, U2, U6, D10 ++ xvfmadd.d D11, U3, U6, D11 ++ ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 ++ xvfmadd.d D14, U2, U7, D14 ++ xvfmadd.d D15, U3, U7, D15 ++ ++ PTR_ADDI A0, A0, 0x80 ++ PTR_ADDI B0, B0, 0x20 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_16x4_L71 ++.L_dgemm_16x4_L0: ++ // Load C ++ GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 ++ GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 ++ GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 ++ GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 ++ GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3, \ ++ U4, U4, D4, U5, U5, D5, U6, U6, D6, U7, U7, D7, \ ++ U8, U8, D8, U9, U9, D9, U10, U10, D10, U11, U11, D11, \ ++ U12, U12, D12, U13, U13, D13, U14, U14, D14, U15, U15, D15 ++.endm ++ ++.macro dgemm_1x4 ++.L_dgemm_1x4: // See dgemm_kernel_16x4.S ++ xvldrepl.d U0, A0, 0x00 ++ xvld U4, B0, 0x00 ++ xvfmul.d D0, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x08 ++ PTR_ADDI B0, B0, 0x20 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M1_L7 */ ++ beq ZERO,TL, .L_dgemm_1x4_M1_L7 ++ xvldrepl.d U8, A0, 0x00 ++ ++ PTR_ADDI TL, TL, -1 ++ xvld U12, B0, 0x00 ++ PTR_ADDI A0, A0, 0x08 ++ PTR_ADDI B0, B0, 0x20 ++ ++ beq ZERO, TL, .L_dgemm_1x4_M1_TL1_END ++.align 5 ++.L_dgemm_1x4_M1_TL1: ++ KERNEL8x1x4 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_1x4_M1_TL1 ++.L_dgemm_1x4_M1_TL1_END: ++ KERNEL8x1x4_END ++.L_dgemm_1x4_M1_L7: ++ /* if (!(L & 7)) goto L_M1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_1x4_M1_L0 ++.align 5 ++.L_dgemm_1x4_M1_L71: ++ xvldrepl.d U0, A0, 0x00 ++ xvld U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x08 ++ PTR_ADDI B0, B0, 0x20 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_1x4_M1_L71 ++.L_dgemm_1x4_M1_L0: ++ // Load C ++ fld.d $f0, C0, 0x00 ++ fld.d $f1, C1, 0x00 ++ fld.d $f2, C2, 0x00 ++ fld.d $f3, C3, 0x00 ++ xvinsve0.d U0, U1, 0x01 ++ 
xvinsve0.d U0, U2, 0x02 ++ xvinsve0.d U0, U3, 0x03 ++ GSUB xvf, d, U0, U0, D0 ++.endm ++ ++.macro dgemm_2x4 ++.L_dgemm_2x4: ++ /* Load 2 * 64 from A0 */ ++ xvldrepl.d U0, A0, 0x00 ++ xvldrepl.d U1, A0, 0x08 ++ xvld U4, B0, 0x00 ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x10 ++ PTR_ADDI B0, B0, 0x20 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M2_L7 */ ++ beq ZERO,TL, .L_dgemm_2x4_M2_L7 ++ ++ xvldrepl.d U8, A0, 0x00 ++ xvldrepl.d U9, A0, 0x08 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvld U12, B0, 0x00 ++ PTR_ADDI A0, A0, 0x10 ++ PTR_ADDI B0, B0, 0x20 ++ ++ beq ZERO, TL, .L_dgemm_2x4_M2_TL1_END ++.align 5 ++.L_dgemm_2x4_M2_TL1: ++ KERNEL8x2x4 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_2x4_M2_TL1 ++.L_dgemm_2x4_M2_TL1_END: ++ KERNEL8x2x4_END ++ ++.L_dgemm_2x4_M2_L7: ++ /* if (!(L & 7)) goto L_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_2x4_M2_L0 ++.align 5 ++.L_dgemm_2x4_M2_L71: ++ xvldrepl.d U0, A0, 0x00 ++ xvldrepl.d U1, A0, 0x08 ++ ++ xvld U4, B0, 0x00 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x10 ++ PTR_ADDI B0, B0, 0x20 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_2x4_M2_L71 ++.L_dgemm_2x4_M2_L0: ++ xvpackev.d D4, D1, D0 ++ xvpackod.d D5, D1, D0 ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++ ++ xvpermi.q U0, U2, 0x02 ++ xvpermi.q U1, U3, 0x02 ++ ++ GSUB xvf, d, U0, U0, D4, U1, U1, D5 ++.endm ++ ++.macro dgemm_4x4 ++.L_dgemm_4x4: ++ /* Load 4 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ xvldrepl.d U5, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U5 ++ ++ xvldrepl.d U6, B0, 0x10 ++ /* line 3 */ ++ xvfmul.d D8, U0, U6 ++ ++ xvldrepl.d U7, B0, 0x18 ++ /* line 4 */ ++ xvfmul.d D12, U0, U7 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x20 ++ PTR_ADDI B0, B0, 0x20 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M4_L7 */ ++ beq ZERO,TL, .L_dgemm_4x4_M4_L7 ++ ++ xvld U8, A0, 0x00 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ xvldrepl.d U14, B0, 0x10 ++ xvldrepl.d U15, B0, 0x18 ++ PTR_ADDI A0, A0, 0x20 ++ PTR_ADDI B0, B0, 0x20 ++ ++ beq ZERO, TL, .L_dgemm_4x4_M4_TL1_END ++.align 5 ++.L_dgemm_4x4_M4_TL1: /* TL-- */ ++ KERNEL8x4x4 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_4x4_M4_TL1 ++.L_dgemm_4x4_M4_TL1_END: ++ KERNEL8x4x4_END ++.L_dgemm_4x4_M4_L7: ++ /* if (!(L & 7)) goto L_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_4x4_M4_L0 ++.align 5 ++.L_dgemm_4x4_M4_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x20 ++ PTR_ADDI B0, B0, 0x20 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_4x4_M4_L71 ++ .L_dgemm_4x4_M4_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++ ++ GSUB xvf, d, U0, U0, D0, U1, U1, D4, U2, U2, D8, U3, U3, D12 ++.endm ++ ++.macro dgemm_8x4 ++.L_dgemm_8x4: ++ /* Load 8 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ 
xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ ++ xvldrepl.d U5, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U5 ++ xvfmul.d D5, U1, U5 ++ ++ xvldrepl.d U6, B0, 0x10 ++ /* line 3 */ ++ xvfmul.d D8, U0, U6 ++ xvfmul.d D9, U1, U6 ++ ++ xvldrepl.d U7, B0, 0x18 ++ /* line 4 */ ++ xvfmul.d D12, U0, U7 ++ xvfmul.d D13, U1, U7 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x20 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M8_L7 */ ++ beq ZERO,TL, .L_dgemm_8x4_M8_L7 ++ ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ xvldrepl.d U14, B0, 0x10 ++ xvldrepl.d U15, B0, 0x18 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x20 ++ ++ beq ZERO, TL, .L_dgemm_8x4_M8_TL1_END ++.align 5 ++.L_dgemm_8x4_M8_TL1: /* TL-- */ ++ KERNEL8x8x4 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_8x4_M8_TL1 ++ ++.L_dgemm_8x4_M8_TL1_END: ++ KERNEL8x8x4_END ++ ++.L_dgemm_8x4_M8_L7: ++ /* if (!(L & 7)) goto L_M8_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_8x4_M8_L0 ++.align 5 ++.L_dgemm_8x4_M8_L71: ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 ++ ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x20 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_8x4_M8_L71 ++.L_dgemm_8x4_M8_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++ ++ /* Load C2 */ ++ xvld U4, C2, 0x00 ++ xvld U5, C2, 0x20 ++ ++ /* Load C3 */ ++ xvld U6, C3, 0x00 ++ xvld U7, C3, 0x20 ++ ++ GSUB xvf, d, U0, U0, D0, U1, U1, D1, \ ++ U2, U2, D4, U3, U3, D5, \ ++ U4, U4, D8, U5, U5, D9, \ ++ U6, U6, D12, U7, U7, D13 ++.endm ++ ++.macro dgemm_4x2 ++.L_dgemm_4x2: ++ /* Load 4 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ xvldrepl.d U5, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U5 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x20 ++ PTR_ADDI B0, B0, 0x10 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_dgemm_4x2_N3_M4_L7 */ ++ beq ZERO,TL, .L_dgemm_4x2_N3_M4_L7 ++ ++ xvld U8, A0, 0x00 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ PTR_ADDI A0, A0, 0x20 ++ PTR_ADDI B0, B0, 0x10 ++ ++ beq ZERO, TL, .L_dgemm_4x2_N3_M4_TL1_END ++.align 5 ++.L_dgemm_4x2_N3_M4_TL1: /* TL-- */ ++ KERNEL8x4x2 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_4x2_N3_M4_TL1 ++.L_dgemm_4x2_N3_M4_TL1_END: ++ KERNEL8x4x2_END ++ ++.L_dgemm_4x2_N3_M4_L7: ++ /* if (!(L & 7)) goto L_dgemm_4x2_N3_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_4x2_N3_M4_L0 ++.align 5 ++.L_dgemm_4x2_N3_M4_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x20 ++ PTR_ADDI B0, B0, 0x10 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_4x2_N3_M4_L71 ++ ++.L_dgemm_4x2_N3_M4_L0: ++ /* Load C0 */ ++ 
xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ GSUB xvf, d, U0, U0, D0, U1, U1, D4 ++.endm ++ ++.macro dgemm_2x2 ++.L_dgemm_2x2: ++ /* Load 2 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ xvldrepl.d U4, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x10 ++ PTR_ADDI B0, B0, 0x10 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_dgemm_2x2_N3_M2_L7 */ ++ beq ZERO,TL, .L_dgemm_2x2_N3_M2_L7 ++ ++ xvld U8, A0, 0x00 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ PTR_ADDI A0, A0, 0x10 ++ PTR_ADDI B0, B0, 0x10 ++ ++ beq ZERO, TL, .L_dgemm_2x2_N3_M2_TL1_END ++.align 5 ++.L_dgemm_2x2_N3_M2_TL1: /* TL-- */ ++ KERNEL8x2x2 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_2x2_N3_M2_TL1 ++.L_dgemm_2x2_N3_M2_TL1_END: ++ KERNEL8x2x2_END ++ ++.L_dgemm_2x2_N3_M2_L7: ++ /* if (!(L & 7)) goto L_dgemm_2x2_N3_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_2x2_N3_M2_L0 ++.align 5 ++.L_dgemm_2x2_N3_M2_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x10 ++ PTR_ADDI B0, B0, 0x10 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_2x2_N3_M2_L71 ++.L_dgemm_2x2_N3_M2_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ GSUB xvf, d, U0, U0, D0, U1, U1, D4 ++.endm ++ ++.macro dgemm_8x2 ++.L_dgemm_8x2: ++ /* Load 8 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ ++ xvldrepl.d U5, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U5 ++ xvfmul.d D5, U1, U5 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x10 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_dgemm_8x2_N3_M8_L7 */ ++ beq ZERO,TL, .L_dgemm_8x2_N3_M8_L7 ++ ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x10 ++ ++ beq ZERO, TL, .L_dgemm_8x2_N3_M8_TL1_END ++.align 5 ++.L_dgemm_8x2_N3_M8_TL1: /* TL-- */ ++ KERNEL8x8x2 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_8x2_N3_M8_TL1 ++.L_dgemm_8x2_N3_M8_TL1_END: ++ KERNEL8x8x2_END ++ ++.L_dgemm_8x2_N3_M8_L7: ++ /* if (!(L & 7)) goto L_dgemm_8x2_N3_M8_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_8x2_N3_M8_L0 ++.align 5 ++.L_dgemm_8x2_N3_M8_L71: ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x10 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_8x2_N3_M8_L71 ++ ++.L_dgemm_8x2_N3_M8_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++ GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D4, U3, U3, D5 ++.endm ++ ++.macro dgemm_16x2 ++.L_dgemm_16x2: ++ /* Load 16 * 64 from A0 ++ * U0 = {a3, a2, a1, a0} ++ * U1 = {a7, a6, a5, a4} ++ * U2 = {a11, a10, a9, a8} ++ * U3 = {a15, a14, a13, a12} ++ */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, 
A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ xvfmul.d D2, U2, U4 ++ xvfmul.d D3, U3, U4 ++ ++ xvldrepl.d U5, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U5 ++ xvfmul.d D5, U1, U5 ++ xvfmul.d D6, U2, U5 ++ xvfmul.d D7, U3, U5 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x80 ++ PTR_ADDI B0, B0, 0x10 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N3_L7 */ ++ beq ZERO,TL, .L_dgemm_16x2_N3_L7 ++ ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ xvld U10, A0, 0x40 ++ xvld U11, A0, 0x60 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ PTR_ADDI A0, A0, 0x80 ++ PTR_ADDI B0, B0, 0x10 ++ ++ beq ZERO, TL, .L_dgemm_16x2_N3_TL1_END ++.align 5 ++.L_dgemm_16x2_N3_TL1: /* TL-- */ ++ KERNEL8x16x2 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_16x2_N3_TL1 ++.L_dgemm_16x2_N3_TL1_END: ++ KERNEL8x16x2_END ++ ++.L_dgemm_16x2_N3_L7: ++ /* if (!(L & 7)) goto L_dgemm_16x2_N3_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_16x2_N3_L0 ++.align 5 ++.L_dgemm_16x2_N3_L71: ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x80 ++ PTR_ADDI B0, B0, 0x10 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_16x2_N3_L71 ++ ++.L_dgemm_16x2_N3_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++ /* Load C1 */ ++ xvld U4, C1, 0x00 ++ xvld U5, C1, 0x20 ++ xvld U6, C1, 0x40 ++ xvld U7, C1, 0x60 ++ GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3, \ ++ U4, U4, D4, U5, U5, D5, U6, U6, D6, U7, U7, D7 ++.endm ++ ++.macro dgemm_2x1 ++.L_dgemm_2x1: ++ /* Load 2 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x10 ++ PTR_ADDI B0, B0, 0x08 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_dgemm_2x1_N1_M2_L7 */ ++ beq ZERO,TL, .L_dgemm_2x1_N1_M2_L7 ++ ++ xvld U8, A0, 0x00 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ PTR_ADDI A0, A0, 0x10 ++ PTR_ADDI B0, B0, 0x08 ++ ++ beq ZERO, TL, .L_dgemm_2x1_N1_M2_TL1_END ++.align 5 ++.L_dgemm_2x1_N1_M2_TL1: /* TL-- */ ++ KERNEL8x2x1 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_2x1_N1_M2_TL1 ++.L_dgemm_2x1_N1_M2_TL1_END: ++ KERNEL8x2x1_END ++ ++.L_dgemm_2x1_N1_M2_L7: ++ /* if (!(L & 7)) goto L_dgemm_2x1_N1_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_2x1_N1_M2_L0 ++.align 5 ++.L_dgemm_2x1_N1_M2_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x10 ++ PTR_ADDI B0, B0, 0x08 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_2x1_N1_M2_L71 ++.L_dgemm_2x1_N1_M2_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ GSUB xvf, d, U0, U0, D0 ++.endm ++ ++.macro dgemm_4x1 ++.L_dgemm_4x1: ++ /* Load 4 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x20 ++ PTR_ADDI B0, B0, 0x08 ++ /* 
Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_dgemm_4x1_N1_M4_L7 */ ++ beq ZERO,TL, .L_dgemm_4x1_N1_M4_L7 ++ ++ xvld U8, A0, 0x00 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ PTR_ADDI A0, A0, 0x20 ++ PTR_ADDI B0, B0, 0x08 ++ ++ beq ZERO, TL, .L_dgemm_4x1_N1_M4_TL1_END ++.align 5 ++.L_dgemm_4x1_N1_M4_TL1: /* TL-- */ ++ KERNEL8x4x1 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_4x1_N1_M4_TL1 ++.L_dgemm_4x1_N1_M4_TL1_END: ++ KERNEL8x4x1_END ++ ++.L_dgemm_4x1_N1_M4_L7: ++ /* if (!(L & 7)) goto L_dgemm_4x1_N1_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_4x1_N1_M4_L0 ++.align 5 ++.L_dgemm_4x1_N1_M4_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x20 ++ PTR_ADDI B0, B0, 0x08 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_4x1_N1_M4_L71 ++.L_dgemm_4x1_N1_M4_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ GSUB xvf, d, U0, U0, D0 ++.endm ++ ++.macro dgemm_8x1 ++.L_dgemm_8x1: ++ /* Load 8 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x08 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_dgemm_8x1_N1_M8_L7 */ ++ beq ZERO,TL, .L_dgemm_8x1_N1_M8_L7 ++ ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x08 ++ ++ beq ZERO, TL, .L_dgemm_8x1_N1_M8_TL1_END ++.align 5 ++.L_dgemm_8x1_N1_M8_TL1: /* TL-- */ ++ KERNEL8x8x1 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_8x1_N1_M8_TL1 ++ ++.L_dgemm_8x1_N1_M8_TL1_END: ++ KERNEL8x8x1_END ++ ++.L_dgemm_8x1_N1_M8_L7: ++ /* if (!(L & 7)) goto L_dgemm_8x1_N1_M8_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_8x1_N1_M8_L0 ++.align 5 ++.L_dgemm_8x1_N1_M8_L71: ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x08 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_8x1_N1_M8_L71 ++.L_dgemm_8x1_N1_M8_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ GSUB xvf, d, U0, U0, D0, U1, U1, D1 ++.endm ++ ++.macro dgemm_16x1 ++.L_dgemm_16x1: ++ /* Load 16 * 64 from A0 ++ * U0 = {a3, a2, a1, a0} ++ * U1 = {a7, a6, a5, a4} ++ * U2 = {a11, a10, a9, a8} ++ * U3 = {a15, a14, a13, a12} ++ */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ xvfmul.d D2, U2, U4 ++ xvfmul.d D3, U3, U4 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x80 ++ PTR_ADDI B0, B0, 0x08 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_dgemm_16x1_N1_L7 */ ++ beq ZERO,TL, .L_dgemm_16x1_N1_L7 ++ ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ xvld U10, A0, 0x40 ++ xvld U11, A0, 0x60 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ PTR_ADDI A0, A0, 0x80 ++ PTR_ADDI B0, B0, 0x08 ++ ++ beq ZERO, TL, .L_dgemm_16x1_N1_TL1_END ++.align 5 ++.L_dgemm_16x1_N1_TL1: /* TL-- */ ++ KERNEL8x16x1 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_16x1_N1_TL1 ++.L_dgemm_16x1_N1_TL1_END: ++ KERNEL8x16x1_END ++ ++.L_dgemm_16x1_N1_L7: 
++ /* if (!(L & 7)) goto L_dgemm_16x1_N1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_16x1_N1_L0 ++.align 5 ++.L_dgemm_16x1_N1_L71: ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x80 ++ PTR_ADDI B0, B0, 0x08 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_16x1_N1_L71 ++.L_dgemm_16x1_N1_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++ GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3 ++.endm ++ ++.macro dgemm_1x2 ++.L_dgemm_1x2: // See dgemm_kernel_16x4.S ++ /* Load 1 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ xvldrepl.d U4, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N3_M1_L7 */ ++ beq ZERO,TL, .L_dgemm_1x2_N3_M1_L7 ++ ++ xvld U8, A0, 0x00 ++ ++ addi.d TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ beq ZERO, TL, .L_dgemm_1x2_N3_M1_TL1_END ++.L_dgemm_1x2_N3_M1_TL1: /* TL-- */ ++ KERNEL8x1x2 ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_1x2_N3_M1_TL1 ++.L_dgemm_1x2_N3_M1_TL1_END: ++ KERNEL8x1x2_END ++.L_dgemm_1x2_N3_M1_L7: ++ /* if (!(L & 7)) goto L_dgemm_1x2_N3_M1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_1x2_N3_M1_L0 ++.L_dgemm_1x2_N3_M1_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_1x2_N3_M1_L71 ++.L_dgemm_1x2_N3_M1_L0: ++ xvld U0, C0, 0x00 ++ xvld U1, C1, 0x00 ++ xvinsve0.d U0, U1, 0x01 ++ xvinsve0.d D0, D4, 0x01 ++ GSUB xvf, d, U0, U0, D0 ++.endm ++ ++.macro dgemm_1x1 ++.L_dgemm_1x1: ++ /* Load 1 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_M1_L7 */ ++ beq ZERO,TL, .L_N1_M1_L7 ++ ++ xvld U8, A0, 0x00 ++ ++ addi.d TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ ++ beq ZERO, TL, .L_N1_M1_TL1_END ++.L_N1_M1_TL1: /* TL-- */ ++ KERNEL8x1x1 ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N1_M1_TL1 ++.L_N1_M1_TL1_END: ++ KERNEL8x1x1_END ++.L_N1_M1_L7: ++ /* if (!(L & 7)) goto L_N1_M1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N1_M1_L0 ++ ++.L_N1_M1_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_N1_M1_L71 ++.L_N1_M1_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ GSUB xvf, d, U0, U0, D0 ++.endm +diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S +new file mode 100644 +index 000000000..694dcdaa9 +--- /dev/null ++++ b/kernel/loongarch64/loongarch64_asm.S +@@ -0,0 +1,430 @@ 
++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++ ++#if __loongarch_grlen == 64 ++#define LA_REG int64_t ++#define REG_SIZE 8 ++#define REG_LOG 3 ++#define PTR_ADDI addi.d ++#define PTR_ADD add.d ++#define PTR_SUB sub.d ++#define PTR_LD ld.d ++#define PTR_ST st.d ++#define PTR_SLLI slli.d ++#define PTR_SRLI srli.d ++#define PTR_SRAI srai.d ++#define PTR_MUL mul.d ++#define PTR_ALSL alsl.d ++#elif __loongarch_grlen == 32 ++#define LA_REG int32_t ++#define REG_SIZE 4 ++#define REG_LOG 2 ++#define PTR_ADDI addi.w ++#define PTR_ADD add.w ++#define PTR_SUB sub.w ++#define PTR_LD ld.w ++#define PTR_ST st.w ++#define PTR_SLLI slli.w ++#define PTR_SRLI srli.w ++#define PTR_SRAI srai.w ++#define PTR_MUL mul.w ++#define PTR_ALSL alsl.w ++#else ++// If neither of the above two conditions is supported, it means this is an early ++// internal toolchain. To ensure maximum compatibility, the following approach is taken: ++#define LA_REG int64_t ++#define REG_SIZE 8 ++#define REG_LOG 3 ++#define PTR_ADDI addi.d ++#define PTR_ADD add.d ++#define PTR_SUB sub.d ++#define PTR_LD ld.d ++#define PTR_ST st.d ++#define PTR_SLLI slli.d ++#define PTR_SRLI srli.d ++#define PTR_SRAI srai.d ++#define PTR_MUL mul.d ++#define PTR_ALSL alsl.d ++#endif ++ ++#if __loongarch_frlen == 64 ++#define FREG_SIZE 8 ++#define FREG_LOG 3 ++#define PTR_FLD fld.d ++#define PTR_FST fst.d ++#elif __loongarch_frlen == 32 ++#define FREG_SIZE 4 ++#define FREG_LOG 2 ++#define PTR_FLD fld.s ++#define PTR_FST fst.s ++#else ++// If neither of the above two conditions is supported, it means this is an early ++// internal toolchain. To ensure maximum compatibility, the following approach is taken: ++#define FREG_SIZE 8 ++#define FREG_LOG 3 ++#define PTR_FLD fld.d ++#define PTR_FST fst.d ++#endif ++ ++// The max registers available to the user which ++// do not need to be preserved across calls. 
++// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html ++#define MAX_INT_CALLER_SAVED 17 ++#define MAX_FP_CALLER_SAVED 24 ++ ++.altmacro // Enable alternate macro mode ++ ++.macro push_if_used regs, fregs ++.if \regs > MAX_INT_CALLER_SAVED ++ PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG) ++ push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 ++.endif ++.if \fregs > MAX_FP_CALLER_SAVED ++ PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG) ++ push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 ++.endif ++.endm // End push_if_used ++.macro pop_if_used regs, fregs ++.if \fregs > MAX_FP_CALLER_SAVED ++ pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 ++ PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG ++.endif ++.if \regs > MAX_INT_CALLER_SAVED ++ pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 ++ PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG ++.endif ++.endm // End pop_if_used ++.macro push_regs from, to ++ PTR_ST $s\()\from, $sp, \from << REG_LOG ++.if \to - \from ++ push_regs %from + 1, \to ++.endif ++.endm // End push_regs ++.macro pop_regs from, to ++ PTR_LD $s\()\from, $sp, \from << REG_LOG ++.if \to - \from ++ pop_regs %from + 1, \to ++.endif ++.endm // End pop_regs ++.macro push_fregs from, to ++ PTR_FST $fs\()\from, $sp, \from << FREG_LOG ++.if \to - \from ++ push_fregs %from + 1, \to ++.endif ++.endm // End push_fregs ++.macro pop_fregs from, to ++ PTR_FLD $fs\()\from, $sp, \from << FREG_LOG ++.if \to - \from ++ pop_fregs %from + 1, \to ++.endif ++.endm // End pop_fregs ++ ++// ++// Instruction Related Macros ++// ++// GLD ++// ++.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg ++.ifeqs "\suf_op", "0" ++ \pre_op\()ld \out, \src, \offset ++.else ++ \pre_op\()ld.\suf_op \out, \src, \offset ++.endif ++.ifnb \more ++ GLD \pre_op, \suf_op, \more ++.endif ++.endm ++ ++// ++// GLD_INC ++// ++.macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg ++.ifeqs "\suf_op", "0" ++ \pre_op\()ld \out, \src, \offset ++.else ++ \pre_op\()ld.\suf_op \out, \src, \offset ++.endif ++ PTR_ADDI \src, \src, \inc ++.ifnb \more ++ GLD_INC \pre_op, \suf_op, \inc, \more ++.endif ++.endm ++// ++// GLDX is same as GLD except the stride is a register ++// ++.macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg ++.ifeqs "\suf_op", "0" ++ \pre_op\()ldx \out, \src, \offset ++.else ++ \pre_op\()ldx.\suf_op \out, \src, \offset ++.endif ++.ifnb \more ++ GLDX \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GLDREPL ++// ++.macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg ++ \pre_op\()ldrepl.\suf_op \out, \src, \offset ++.ifnb \more ++ GLDREPL \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GST ++// ++.macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg ++.ifeqs "\suf_op", "0" ++ \pre_op\()st \src, \dst, \offset ++.else ++ \pre_op\()st.\suf_op \src, \dst, \offset ++.endif ++.ifnb \more ++ GST \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GMUL ++// ++.macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg ++ \pre_op\()mul.\suf_op \out, \in0, \in1 ++.ifnb \more ++ GMUL \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GMADD ++// ++.macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg ++ \pre_op\()madd.\suf_op \out, \in0, \in1, \in2 ++.ifnb \more ++ GMADD \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GADD ++// ++.macro 
GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg ++ \pre_op\()add.\suf_op \out, \in0, \in1 ++.ifnb \more ++ GADD \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GADDI ++// ++.macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg ++ \pre_op\()addi.\suf_op \out, \in0, \in1 ++.ifnb \more ++ GADDI \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GSUB ++// ++.macro GSUB pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg ++ \pre_op\()sub.\suf_op \out, \in0, \in1 ++.ifnb \more ++ GSUB \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GSLLI ++// ++.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg ++ \pre_op\()slli.\suf_op \out, \in0, \in1 ++.ifnb \more ++ GSLLI \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GINSVE0 ++// ++.macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg ++ \pre_op\()insve0.\suf_op \out, \in0, \in1 ++.ifnb \more ++ GINSVE0 \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GXOR ++// ++.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg ++ \pre_op\()xor.\suf_op \out, \in0, \in1 ++.ifnb \more ++ GXOR \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GPERMI ++// ++.macro GPERMI pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg ++ \pre_op\()permi.\suf_op \out, \in0, \in1 ++.ifnb \more ++ GPERMI \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GNMSUB ++// ++.macro GNMSUB pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg ++ \pre_op\()nmsub.\suf_op \out, \in0, \in1, \in2 ++.ifnb \more ++ GNMSUB \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GPRELD ++// ++.macro GPRELD in0:req, in1:req, in2:req, more:vararg ++ preld \in0, \in1, \in2 ++.ifnb \more ++ GPRELD \more ++.endif ++.endm ++ ++// ++// Compound instructions ++// ++// GACC: Accumulate the values of vector registers ++// ++.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg ++.ifeqs "\pre_op", "xvf" ++ xvpermi.q \out, \in, 0x01 ++ \pre_op\()add.\suf_op \in, \out, \in ++ xvpackod.d \out, \in, \in ++ \pre_op\()add.\suf_op \out, \out, \in ++.ifeqs "\suf_op", "s" ++ xvpackod.w \in, \out, \out ++ \pre_op\()add.\suf_op \out, \out, \in ++.endif ++.endif ++ ++.ifeqs "\pre_op", "vf" ++ vpackod.d \out, \in, \in ++ \pre_op\()add.\suf_op \out, \out, \in ++.ifeqs "\suf_op", "s" ++ vpackod.w \in, \out, \out ++ \pre_op\()add.\suf_op \out, \out, \in ++.endif ++.endif ++ ++.ifeqs "\pre_op", "xv" ++ xvpermi.q \out, \in, 0x01 ++ \pre_op\()add.\suf_op \in, \out, \in ++ xvpackod.d \out, \in, \in ++ \pre_op\()add.\suf_op \out, \out, \in ++.ifnc "\suf_op", "d" ++ xvpackod.w \in, \out, \out ++ \pre_op\()add.\suf_op \out, \out, \in ++.ifnc "\suf_op", "w" ++ xvpackod.h \in, \out, \out ++ \pre_op\()add.\suf_op \out, \out, \in ++.ifnc "\suf_op", "h" ++ xvpackod.b \in, \out, \out ++ \pre_op\()add.\suf_op \out, \out, \in ++.endif ++.endif ++.endif ++.endif ++ ++.ifeqs "\pre_op", "v" ++ vpackod.d \out, \in, \in ++ \pre_op\()add.\suf_op \out, \out, \in ++.ifnc "\suf_op", "d" ++ vpackod.w \in, \out, \out ++ \pre_op\()add.\suf_op \out, \out, \in ++.ifnc "\suf_op", "w" ++ vpackod.h \in, \out, \out ++ \pre_op\()add.\suf_op \out, \out, \in ++.ifnc "\suf_op", "h" ++ vpackod.b \in, \out, \out ++ \pre_op\()add.\suf_op \out, \out, \in ++.endif ++.endif ++.endif ++.endif ++ ++.ifnb \more ++ GACC \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GMOV ++// ++.macro GMOV pre_op:req, out:req, in:req, more:vararg ++ \pre_op\()or.v \out, \in, \in ++.ifnb \more ++ GMOV \pre_op, \more ++.endif ++.endm ++ 
++// ++// Media Related Macros ++// ++.macro GSBUTTERFLY pre_op, suf_op, out0, out1, in0, in1 ++ \pre_op\()ilvl.\suf_op \out0, \in0, \in1 ++ \pre_op\()ilvh.\suf_op \out1, \in0, \in1 ++.endm ++.macro GINTERLACE pre_op, suf_op, out0, out1, in0, in1 ++ \pre_op\()pickev.\suf_op \out0, \in0, \in1 ++ \pre_op\()pickod.\suf_op \out1, \in0, \in1 ++.endm ++ ++// ++// TRANSPOSE4x4_D: Transpose 4x4 block with double-word elements in vectors, ++// has no pre_op param. 128-bit vector instructions are not supported. ++// ++.macro GTRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \ ++ vt0, vt1 ++ GSBUTTERFLY xv, d, \vt0, \out1, \in1, \in0 ++ GSBUTTERFLY xv, d, \vt1, \out3, \in3, \in2 ++ GMOV xv, \out0, \vt0, \out2, \vt1, \vt1, \out3 ++ GPERMI xv, q, \out0, \out2, 0x02, \out2, \vt0, 0x31, \out3, \out1, 0x31, \out1, \vt1, 0x02 ++.endm ++ ++.macro GTRANSPOSE8x8_W out0, out1, out2, out3, out4, out5, out6, out7, \ ++ in0, in1, in2, in3, in4, in5, in6, in7, \ ++ tmp0, tmp1, tmp2, tmp3 ++ GSBUTTERFLY xv, w, \tmp0, \tmp2, \in2, \in0 ++ GSBUTTERFLY xv, w, \tmp1, \tmp3, \in3, \in1 ++ GSBUTTERFLY xv, w, \out0, \out1, \tmp1, \tmp0 ++ GSBUTTERFLY xv, w, \out2, \out3, \tmp3, \tmp2 ++ ++ GSBUTTERFLY xv, w, \tmp0, \tmp2, \in6, \in4 ++ GSBUTTERFLY xv, w, \tmp1, \tmp3, \in7, \in5 ++ GSBUTTERFLY xv, w, \out4, \out5, \tmp1, \tmp0 ++ GSBUTTERFLY xv, w, \out6, \out7, \tmp3, \tmp2 ++ ++ GMOV xv, \tmp0, \out0, \tmp1, \out1, \tmp2, \out2, \tmp3, \out3 ++ ++ GPERMI xv, q, \out0, \out4, 0x02, \out1, \out5, 0x02, \ ++ \out2, \out6, 0x02, \out3, \out7, 0x02, \ ++ \out4, \tmp0, 0x31, \out5, \tmp1, 0x31, \ ++ \out6, \tmp2, 0x31, \out7, \tmp3, 0x31 ++.endm +diff --git a/kernel/loongarch64/sgemm_kernel_16x8_lasx.S b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S +new file mode 100644 +index 000000000..bd609394e +--- /dev/null ++++ b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S +@@ -0,0 +1,2348 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/08/23 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++* 2023/08/23 guxiwei ++* Parameter: ++* SGEMM_DEFAULT_UNROLL_N 8 ++* SGEMM_DEFAULT_UNROLL_M 16 ++* SGEMM_DEFAULT_P 256 ++* SGEMM_DEFAULT_Q 256 ++* SGEMM_DEFAULT_R 1024 ++* A_PRE 1024 ++* B_PRE 256 // Enable prefetching for B results in a performance decrease, temporarily disabled. ++* ++* ++* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000: ++* 1 thread: 71.7 GFLOPS ++* 2 threads: 142.6 GFLOPS ++* 3 threads: 211.5 GFLOPS ++* 4 threads: 265.0 GFLOPS ++*********************************************************************/ ++ ++/* Function parameters */ ++#define M $r4 // param 1: bm ++#define N $r5 // param 2: bn ++#define K $r6 // param 3: bk ++#define ALPHA $f0 // param 4: alpha ++#define A $r7 // param 5: ba ++#define B $r8 // param 6: bb ++#define C $r9 // param 7: bc ++#define LDC $r10 // param 8: ldc ++ ++#ifdef TRMMKERNEL ++#define OFFSET $r11 // param 9: offset ++#endif ++#define OFF $r12 ++ ++/* Cycle control parameters */ ++#define I $r13 ++#define J $r14 ++#define L $r15 ++#define TL $r16 ++/* Matrix address */ ++#define A0 $r17 ++#define B0 $r18 ++#define C0 $r19 ++#define C1 $r20 ++#define C2 $r23 ++#define C3 $r24 ++#define C4 $r25 ++#define C5 $r26 ++#define C6 $r27 ++#define C7 $r28 ++#define T0 $r29 ++#define T1 $r30 ++#undef ZERO ++#define ZERO $r0 ++ ++/* LASX Vectors ++ * Store 16 sets of 32-bit data in A using UO and U1, with each register holding 8 data. ++ * Use X0 through X7 to store 8 sets of 32-bit data in B, with each register holding a broadcast value of a single data. ++ * Use D0 to D15 to store intermediate values of the computation. ++ * Use VALPHA to store the broadcast value of alpha ++ */ ++#define U0 $xr0 ++#define U1 $xr1 ++#define X0 $xr2 ++#define X1 $xr3 ++#define X2 $xr4 ++#define X3 $xr5 ++#define X4 $xr6 ++#define X5 $xr7 ++#define X6 $xr8 ++#define X7 $xr9 ++#define D0 $xr10 ++#define D1 $xr11 ++#define D2 $xr12 ++#define D3 $xr13 ++#define D4 $xr14 ++#define D5 $xr15 ++#define D6 $xr16 ++#define D7 $xr17 ++#define D8 $xr18 ++#define D9 $xr19 ++#define D10 $xr20 ++#define D11 $xr21 ++#define D12 $xr22 ++#define D13 $xr23 ++#define D14 $xr24 ++#define D15 $xr25 ++#define VALPHA $xr26 ++ ++/* Prefetch interval */ ++#define A_PRE 0x400 ++#define B_PRE 0x100 ++ ++// Loops outline: ++// .L_N8 <-------------------------------------------------------------------------------------------- /* if N >> 3 == 0, goto .L_N7; else, enter .L_N8. */ ++// | .L_M16 <--------------------- | /* if M >> 4 == 0, goto .L_M8; Otherwise, enter .L_M16. 
*/ ++// | | .L_M16_TL1 | | ++// | | .L_M16_L7 | The entire core loop of the function, KERNEK16x8 | ++// | | .L_M16_L71 | | ++// | | .L_M16_L0 ---------------- | ++// | .L_M8 | ++// | | .L_M8_TL1 | | ++// | | .L_M8_L7 | KERNEK8x8 | ++// | | .L_M8_L71 | | ++// | | .L_M8_L0 | | ++// | .L_M4 | ++// | | .L_M4_TL1 | | ++// | | .L_M4_L7 | KERNEK4x8 | ++// | | .L_M4_L71 | | ++// | | .L_M4_L0 | | ++// | .L_M2 | ++// | | .L_M2_TL1 | | ++// | | .L_M2_L7 | KERNEK2x8 | ++// | | .L_M2_L71 | | ++// | | .L_M2_L0 | | ++// | .L_M1 | ++// | | .L_M1_TL1 | | ++// | | .L_M1_L7 | KERNEK1x8 | ++// | | .L_M1_L71 | | ++// | | .L_M1_L0 | | ++// | .L_M0------------------------------------------------------------------------------------------ ++// .L_N7 /* if N & 7 == 0, goto .L_N0; else, enter .L_N4 */ ++// .L_N4 ++// | .L_N4_M16 <--------------------- ++// | | .L_N4_M16_TL1 | ++// | | .L_N4_M16_L7 | KERNEL16x4 ++// | | .L_N4_M16_L71 | ++// | | .L_N4_M16_L0 ---------------- ++// | .L_N4_M8 ++// | | .L_N4_M8_TL1 | ++// | | .L_N4_M8_L7 | KERNEL8x4 ++// | | .L_N4_M8_L71 | ++// | | .L_N4_M8_L0 | ++// | .L_N4_M4 ++// | | .L_N4_M4_TL1 | ++// | | .L_N4_M4_L7 | KERNEL4x4 ++// | | .L_N4_M4_L71 | ++// | | .L_N4_M4_L0 | ++// | .L_N4_M2 ++// | | .L_N4_M2_TL1 | ++// | | .L_N4_M2_L7 | KERNEL2x4 ++// | | .L_N4_M2_L71 | ++// | | .L_N4_M2_L0 | ++// | .L_N4_M1 ++// | | .L_N4_M1_TL1 | ++// | | .L_N4_M1_L7 | KERNEL1x4 ++// | | .L_N4_M1_L71 | ++// | | .L_N4_M1_L0 | ++// | .L_N4_M0 ++// .L_N3 /* if N & 2 == 0, goto .L_N1; else enter .L_N2 */ ++// .L_N2 ++// | .L_N2_M16 <--------------------- ++// | | .L_N2_M16_TL1 | ++// | | .L_N2_M16_L7 | KERNEL16x2 ++// | | .L_N2_M16_L71 | ++// | | .L_N2_M16_L0 ---------------- ++// | .L_N2_M8 ++// | | .L_N2_M8_TL1 | ++// | | .L_N2_M8_L7 | KERNEL8x2 ++// | | .L_N2_M8_L71 | ++// | | .L_N2_M8_L0 | ++// | .L_N2_M4 ++// | | .L_N2_M4_TL1 | ++// | | .L_N2_M4_L7 | KERNEL4x2 ++// | | .L_N2_M4_L71 | ++// | | .L_N2_M4_L0 | ++// | .L_N2_M2 ++// | | .L_N2_M2_TL1 | ++// | | .L_N2_M2_L7 | KERNEL2x2 ++// | | .L_N2_M2_L71 | ++// | | .L_N2_M2_L0 | ++// | .L_N2_M1 ++// | | .L_N2_M1_TL1 | ++// | | .L_N2_M1_L7 | KERNEL1x2 ++// | | .L_N2_M1_L71 | ++// | | .L_N2_M1_L0 | ++// | .L_N2_M0 ++// .L_N1 ++// | .L_N1_M16 <--------------------- ++// | | .L_N1_M16_TL1 | ++// | | .L_N1_M16_L7 | KERNEL16x1 ++// | | .L_N1_M16_L71 | ++// | | .L_N1_M16_L0 ---------------- ++// | .L_N1_M8 ++// | | .L_N1_M8_TL1 | ++// | | .L_N1_M8_L7 | KERNEL8x1 ++// | | .L_N1_M8_L71 | ++// | | .L_N1_M8_L0 | ++// | .L_N1_M4 ++// | | .L_N1_M4_TL1 | ++// | | .L_N1_M4_L7 | KERNEL4x1 ++// | | .L_N1_M4_L71 | ++// | | .L_N1_M4_L0 | ++// | .L_N1_M2 ++// | | .L_N1_M2_TL1 | ++// | | .L_N1_M2_L7 | KERNEL2x1 ++// | | .L_N1_M2_L71 | ++// | | .L_N1_M2_L0 | ++// | .L_N1_M1 ++// | | .L_N1_M1_TL1 | ++// | | .L_N1_M1_L7 | KERNEL1x1 ++// | | .L_N1_M1_L71 | ++// | | .L_N1_M1_L0 | ++// | .L_N1_M0 ++// .L_N0 ++ ++/*************** sgemm_kernel_macros ***************/ ++.macro KERNEL1x16x8_START ++ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 ++ ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C ++ GMUL xvf, s, D0, U0, X0, D1, U1, X0 ++ preld 0, C0, 0x00 ++ GMUL xvf, s, D2, U0, X1, D3, U1, X1 ++ preld 0, C1, 0x00 ++ GMUL xvf, s, D4, U0, X2, D5, U1, X2 ++ preld 0, C2, 0x00 ++ GMUL xvf, s, D6, U0, X3, D7, U1, X3 ++ preld 0, C3, 0x00 ++ GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C ++ GMUL xvf, s, D8, U0, X4, D9, U1, X4 ++ preld 0, C4, 0x00 ++ GMUL xvf, s, D10, U0, X5, D11, U1, X5 ++ preld 0, C5, 0x00 ++ GMUL xvf, s, D12, U0, X6, D13, U1, X6 ++ 
preld 0, C6, 0x00 ++ GMUL xvf, s, D14, U0, X7, D15, U1, X7 ++ preld 0, C7, 0x00 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x20 ++.endm ++ ++.macro KERNEL1x16x8 ++ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 ++ ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C ++ GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ ++ D2, U0, X1, D2, D3, U1, X1, D3 ++ preld 0, A0, A_PRE ++ GMADD xvf, s, D4, U0, X2, D4, D5, U1, X2, D5, \ ++ D6, U0, X3, D6, D7, U1, X3 D7 ++ GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C ++ GMADD xvf, s, D8, U0, X4, D8, D9, U1, X4, D9, \ ++ D10, U0, X5, D10, D11, U1, X5, D11 ++ //preld 0, B0, B_PRE ++ GMADD xvf, s, D12, U0, X6, D12, D13, U1, X6, D13, \ ++ D14, U0, X7, D14, D15, U1, X7 D15 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x20 ++.endm ++ ++.macro KERNEL8x16x8 ++.rept 8 ++ KERNEL1x16x8 ++.endr ++.endm ++ ++.macro SAVE16x8 ++#if defined(TRMMKERNEL) ++ GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ ++ D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA, \ ++ D8, D8, VALPHA, D9, D9, VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \ ++ D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA ++#else ++ /* Load C0 */ ++ GLD xv, , X0, C0, 0x00, X1, C0, 0x20 ++ GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 ++ /* Load C1 */ ++ GLD xv, , X2, C1, 0x00, X3, C1, 0x20 ++ GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 ++ /* Load C2 */ ++ GLD xv, , X4, C2, 0x00, X5, C2, 0x20 ++ GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5 ++ /* Load C3 */ ++ GLD xv, , X6, C3, 0x00, X7, C3, 0x20 ++ GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7 ++ /* Load C4 */ ++ GLD xv, , X0, C4, 0x00, X1, C4, 0x20 ++ GMADD xvf, s, D8, D8, VALPHA, X0, D9, D9, VALPHA, X1 ++ /* Load C5 */ ++ GLD xv, , X2, C5, 0x00, X3, C5, 0x20 ++ GMADD xvf, s, D10, D10, VALPHA, X2, D11, D11, VALPHA, X3 ++ /* Load C6 */ ++ GLD xv, , X4, C6, 0x00, X5, C6, 0x20 ++ GMADD xvf, s, D12, D12, VALPHA, X4, D13, D13, VALPHA, X5 ++ /* Load C7 */ ++ GLD xv, , X6, C7, 0x00, X7, C7, 0x20 ++ GMADD xvf, s, D14, D14, VALPHA, X6, D15, D15, VALPHA, X7 ++#endif // #if defined(TRMMKERNEL) ++ GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ ++ D2, C1, 0x00, D3, C1, 0x20, \ ++ D4, C2, 0x00, D5, C2, 0x20, \ ++ D6, C3, 0x00, D7, C3, 0x20, \ ++ D8, C4, 0x00, D9, C4, 0x20, \ ++ D10, C5, 0x00, D11, C5, 0x20, \ ++ D12, C6, 0x00, D13, C6, 0x20, \ ++ D14, C7, 0x00, D15, C7, 0x20 ++#if __loongarch_grlen == 64 ++ GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ ++ C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 ++#elif __loongarch_grlen == 32 ++ GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ ++ C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 ++#else ++ GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ ++ C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 ++#endif ++.endm ++ ++// m = 8, 4, 2, 1 ++// stride = 0x20, 0x10, 0x08, 0x04 ++.macro KERNEL1xMx8_START m, stride ++.if \m == 8 ++ GLD xv, , U0, A0, 0x00 ++.elseif \m == 4 ++ GLD v, , $vr0, A0, 0x00 ++.elseif \m ==2 ++ GLD f, d, $f0, A0, 0x00 ++.elseif \m ==1 ++ GLD f, s, $f0, A0, 0x00 ++.endif ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C ++ GMUL xvf, s, D0, U0, X0, D2, U0, X1, \ ++ D4, U0, X2, D6, U0, X3 ++ GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C ++ GMUL xvf, s, D8, U0, X4, D10, U0, X5, \ ++ D12, U0, X6, D14, U0, X7 ++ PTR_ADDI A0, A0, \stride ++ PTR_ADDI B0, B0, 0x20 
++.endm ++ ++.macro KERNEL1xMx8 m, stride ++.if \m == 8 ++ GLD xv, , U0, A0, 0x00 ++.elseif \m == 4 ++ GLD v, , $vr0, A0, 0x00 ++.elseif \m ==2 ++ GLD f, d, $f0, A0, 0x00 ++.elseif \m ==1 ++ GLD f, s, $f0, A0, 0x00 ++.endif ++ ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C ++ GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \ ++ D4, U0, X2, D4, D6, U0, X3, D6 ++ GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C ++ GMADD xvf, s, D8, U0, X4, D8, D10, U0, X5, D10, \ ++ D12, U0, X6, D12, D14, U0, X7, D14 ++ PTR_ADDI A0, A0, \stride ++ PTR_ADDI B0, B0, 0x20 ++.endm ++ ++.macro KERNEL8xMx8 m, stride ++.rept 8 ++ KERNEL1xMx8 \m, \stride ++.endr ++.endm ++ ++.macro SAVEMx8 m, stride ++#if defined(TRMMKERNEL) ++ GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \ ++ D4, D4, VALPHA, D6, D6, VALPHA, \ ++ D8, D8, VALPHA, D10, D10, VALPHA, \ ++ D12, D12, VALPHA, D14, D14, VALPHA ++#else ++ /* Load C0, C1, C2, C3, C4, C5, C6, C7 */ ++ .if \m == 8 ++ GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00 ++ .elseif \m == 4 ++ GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00 ++.elseif \m == 2 ++ GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 ++.elseif \m == 1 ++ GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 ++ .endif ++ GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \ ++ D4, D4, VALPHA, X4, D6, D6, VALPHA, X6 ++.if \m == 8 ++ GLD xv, , X0, C4, 0x00, X2, C5, 0x00, X4, C6, 0x00, X6, C7, 0x00 ++.elseif \m == 4 ++ GLD v, , $vr2, C4, 0x00, $vr4, C5, 0x00, $vr6, C6, 0x00, $vr8, C7, 0x00 ++.elseif \m == 2 ++ GLD f, d, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00 ++.elseif \m == 1 ++ GLD f, s, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00 ++.endif ++ GMADD xvf, s, D8, D8, VALPHA, X0, D10, D10, VALPHA, X2, \ ++ D12, D12, VALPHA, X4, D14, D14, VALPHA, X6 ++#endif // #if defined(TRMMKERNEL) ++.if \m == 8 ++ GST xv, , D0, C0, 0x00, D2, C1, 0x00, \ ++ D4, C2, 0x00, D6, C3, 0x00, \ ++ D8, C4, 0x00, D10, C5, 0x00, \ ++ D12, C6, 0x00, D14, C7, 0x00 ++.elseif \m == 4 ++ GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \ ++ $vr14, C2, 0x00, $vr16, C3, 0x00, \ ++ $vr18, C4, 0x00, $vr20, C5, 0x00, \ ++ $vr22, C6, 0x00, $vr24, C7, 0x00 ++.elseif \m == 2 ++ GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \ ++ $f14, C2, 0x00, $f16, C3, 0x00, \ ++ $f18, C4, 0x00, $f20, C5, 0x00, \ ++ $f22, C6, 0x00, $f24, C7, 0x00 ++.elseif \m == 1 ++ GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \ ++ $f14, C2, 0x00, $f16, C3, 0x00, \ ++ $f18, C4, 0x00, $f20, C5, 0x00, \ ++ $f22, C6, 0x00, $f24, C7, 0x00 ++.endif ++#if __loongarch_grlen == 64 ++ GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ ++ C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride ++#elif __loongarch_grlen == 32 ++ GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ ++ C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride ++#else ++ GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ ++ C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride ++#endif ++.endm ++ ++.macro KERNEL1x16x4_START ++ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 ++ ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C ++ GMUL xvf, s, D0, U0, X0, D1, U1, X0, \ ++ D2, U0, X1, D3, U1, X1, \ ++ D4, U0, X2, D5, U1, X2, \ ++ D6, U0, X3, D7, U1, X3 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x10 ++.endm ++ ++.macro KERNEL1x16x4 ++ GLD xv, , 
U0, A0, 0x00, U1, A0, 0x20 ++ ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C ++ GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ ++ D2, U0, X1, D2, D3, U1, X1, D3, \ ++ D4, U0, X2, D4, D5, U1, X2, D5, \ ++ D6, U0, X3, D6, D7, U1, X3 D7 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x10 ++.endm ++ ++.macro KERNEL8x16x4 ++.rept 8 ++ KERNEL1x16x4 ++.endr ++.endm ++ ++.macro SAVE16x4 ++#if defined(TRMMKERNEL) ++ GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ ++ D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA ++#else ++ /* Load C0 */ ++ GLD xv, , X0, C0, 0x00, X1, C0, 0x20 ++ GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 ++ /* Load C1 */ ++ GLD xv, , X2, C1, 0x00, X3, C1, 0x20 ++ GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 ++ /* Load C2 */ ++ GLD xv, , X4, C2, 0x00, X5, C2, 0x20 ++ GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5 ++ /* Load C3 */ ++ GLD xv, , X6, C3, 0x00, X7, C3, 0x20 ++ GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7 ++#endif // #if defined(TRMMKERNEL) ++ GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ ++ D2, C1, 0x00, D3, C1, 0x20, \ ++ D4, C2, 0x00, D5, C2, 0x20, \ ++ D6, C3, 0x00, D7, C3, 0x20 ++#if __loongarch_grlen == 64 ++ GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 ++#elif __loongarch_grlen == 32 ++ GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 ++#else ++ GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 ++#endif ++.endm ++ ++// m = 8, 4, 2, 1 ++// stride = 0x20, 0x10, 0x08, 0x04 ++.macro KERNEL1xMx4_START m, stride ++.if \m == 8 ++ GLD xv, , U0, A0, 0x00 ++.elseif \m == 4 ++ GLD v, , $vr0, A0, 0x00 ++.elseif \m ==2 ++ GLD f, d, $f0, A0, 0x00 ++.elseif \m ==1 ++ GLD f, s, $f0, A0, 0x00 ++.endif ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C ++ GMUL xvf, s, D0, U0, X0, D2, U0, X1, \ ++ D4, U0, X2, D6, U0, X3 ++ PTR_ADDI A0, A0, \stride ++ PTR_ADDI B0, B0, 0x10 ++.endm ++ ++.macro KERNEL1xMx4 m, stride ++.if \m == 8 ++ GLD xv, , U0, A0, 0x00 ++.elseif \m == 4 ++ GLD v, , $vr0, A0, 0x00 ++.elseif \m ==2 ++ GLD f, d, $f0, A0, 0x00 ++.elseif \m ==1 ++ GLD f, s, $f0, A0, 0x00 ++.endif ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C ++ GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \ ++ D4, U0, X2, D4, D6, U0, X3, D6 ++ PTR_ADDI A0, A0, \stride ++ PTR_ADDI B0, B0, 0x10 ++.endm ++ ++.macro KERNEL8xMx4 m, stride ++.rept 8 ++ KERNEL1xMx4 \m, \stride ++.endr ++.endm ++ ++.macro SAVEMx4 m, stride ++#if defined(TRMMKERNEL) ++ GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \ ++ D4, D4, VALPHA, D6, D6, VALPHA ++#else ++ /* Load C0, C1, C2, C3 */ ++ .if \m == 8 ++ GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00 ++ .elseif \m == 4 ++ GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00 ++.elseif \m == 2 ++ GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 ++.elseif \m == 1 ++ GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 ++ .endif ++ GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \ ++ D4, D4, VALPHA, X4, D6, D6, VALPHA, X6 ++#endif // #if defined(TRMMKERNEL) ++.if \m == 8 ++ GST xv, , D0, C0, 0x00, D2, C1, 0x00, \ ++ D4, C2, 0x00, D6, C3, 0x00 ++.elseif \m == 4 ++ GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \ ++ $vr14, C2, 0x00, $vr16, C3, 0x00 ++.elseif \m == 2 ++ GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \ ++ $f14, C2, 0x00, $f16, C3, 0x00 ++.elseif \m == 1 ++ GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \ ++ 
$f14, C2, 0x00, $f16, C3, 0x00 ++.endif ++#if __loongarch_grlen == 64 ++ GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride ++#elif __loongarch_grlen == 32 ++ GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride ++#else ++ GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride ++#endif ++.endm ++ ++.macro KERNEL1x16x2_START ++ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 ++ ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 ++ GMUL xvf, s, D0, U0, X0, D1, U1, X0, \ ++ D2, U0, X1, D3, U1, X1 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x08 ++.endm ++ ++.macro KERNEL1x16x2 ++ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 ++ ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 ++ GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ ++ D2, U0, X1, D2, D3, U1, X1, D3 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x08 ++.endm ++ ++.macro KERNEL8x16x2 ++.rept 8 ++ KERNEL1x16x2 ++.endr ++.endm ++ ++.macro SAVE16x2 ++#if defined(TRMMKERNEL) ++ GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA ++#else ++ /* Load C0 */ ++ GLD xv, , X0, C0, 0x00, X1, C0, 0x20 ++ GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 ++ /* Load C1 */ ++ GLD xv, , X2, C1, 0x00, X3, C1, 0x20 ++ GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 ++#endif // #if defined(TRMMKERNEL) ++ GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ ++ D2, C1, 0x00, D3, C1, 0x20 ++#if __loongarch_grlen == 64 ++ GADDI , d, C0, C0, 0x40, C1, C1, 0x40 ++#elif __loongarch_grlen == 32 ++ GADDI , w, C0, C0, 0x40, C1, C1, 0x40 ++#else ++ GADDI , d, C0, C0, 0x40, C1, C1, 0x40 ++#endif ++.endm ++ ++// m = 8, 4, 2, 1 ++// stride = 0x20, 0x10, 0x08, 0x04 ++.macro KERNEL1xMx2_START m, stride ++.if \m == 8 ++ GLD xv, , U0, A0, 0x00 ++.elseif \m == 4 ++ GLD v, , $vr0, A0, 0x00 ++.elseif \m ==2 ++ GLD f, d, $f0, A0, 0x00 ++.elseif \m ==1 ++ GLD f, s, $f0, A0, 0x00 ++.endif ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 ++ GMUL xvf, s, D0, U0, X0, D2, U0, X1 ++ PTR_ADDI A0, A0, \stride ++ PTR_ADDI B0, B0, 0x08 ++.endm ++ ++.macro KERNEL1xMx2 m, stride ++.if \m == 8 ++ GLD xv, , U0, A0, 0x00 ++.elseif \m == 4 ++ GLD v, , $vr0, A0, 0x00 ++.elseif \m ==2 ++ GLD f, d, $f0, A0, 0x00 ++.elseif \m ==1 ++ GLD f, s, $f0, A0, 0x00 ++.endif ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 ++ GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2 ++ PTR_ADDI A0, A0, \stride ++ PTR_ADDI B0, B0, 0x08 ++.endm ++ ++.macro KERNEL8xMx2 m, stride ++.rept 8 ++ KERNEL1xMx2 \m, \stride ++.endr ++.endm ++ ++.macro SAVEMx2 m, stride ++#if defined(TRMMKERNEL) ++ GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA ++#else ++ /* Load C0, C1 */ ++ .if \m == 8 ++ GLD xv, , X0, C0, 0x00, X2, C1, 0x00 ++ .elseif \m == 4 ++ GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00 ++.elseif \m == 2 ++ GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00 ++.elseif \m == 1 ++ GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00 ++ .endif ++ GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2 ++#endif // #if defined(TRMMKERNEL) ++.if \m == 8 ++ GST xv, , D0, C0, 0x00, D2, C1, 0x00 ++.elseif \m == 4 ++ GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00 ++.elseif \m == 2 ++ GST f, d, $f10, C0, 0x00, $f12, C1, 0x00 ++.elseif \m == 1 ++ GST f, s, $f10, C0, 0x00, $f12, C1, 0x00 ++.endif ++#if __loongarch_grlen == 64 ++ GADDI , d, C0, C0, \stride, C1, C1, \stride ++#elif __loongarch_grlen == 32 ++ GADDI , w, C0, C0, \stride, C1, C1, \stride ++#else ++ GADDI , d, C0, C0, \stride, C1, C1, \stride ++#endif ++.endm ++ ++.macro KERNEL1x16x1_START ++ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 ++ GLDREPL xv, w, X0, B0, 0x00 ++ 
GMUL xvf, s, D0, U0, X0, D1, U1, X0 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x04 ++.endm ++ ++.macro KERNEL1x16x1 ++ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 ++ GLDREPL xv, w, X0, B0, 0x00 ++ GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x04 ++.endm ++ ++.macro KERNEL8x16x1 ++.rept 8 ++ KERNEL1x16x1 ++.endr ++.endm ++ ++.macro SAVE16x1 ++#if defined(TRMMKERNEL) ++ GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA ++#else ++ /* Load C0 */ ++ GLD xv, , X0, C0, 0x00, X1, C0, 0x20 ++ GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 ++#endif // #if defined(TRMMKERNEL) ++ GST xv, , D0, C0, 0x00, D1, C0, 0x20 ++#if __loongarch_grlen == 64 ++ GADDI , d, C0, C0, 0x40 ++#elif __loongarch_grlen == 32 ++ GADDI , w, C0, C0, 0x40 ++#else ++ GADDI , d, C0, C0, 0x40 ++#endif ++.endm ++ ++// m = 8, 4, 2, 1 ++// stride = 0x20, 0x10, 0x08, 0x04 ++.macro KERNEL1xMx1_START m, stride ++.if \m == 8 ++ GLD xv, , U0, A0, 0x00 ++.elseif \m == 4 ++ GLD v, , $vr0, A0, 0x00 ++.elseif \m ==2 ++ GLD f, d, $f0, A0, 0x00 ++.elseif \m ==1 ++ GLD f, s, $f0, A0, 0x00 ++.endif ++ GLDREPL xv, w, X0, B0, 0x00 ++ GMUL xvf, s, D0, U0, X0 ++ PTR_ADDI A0, A0, \stride ++ PTR_ADDI B0, B0, 0x04 ++.endm ++ ++.macro KERNEL1xMx1 m, stride ++.if \m == 8 ++ GLD xv, , U0, A0, 0x00 ++.elseif \m == 4 ++ GLD v, , $vr0, A0, 0x00 ++.elseif \m ==2 ++ GLD f, d, $f0, A0, 0x00 ++.elseif \m ==1 ++ GLD f, s, $f0, A0, 0x00 ++.endif ++ GLDREPL xv, w, X0, B0, 0x00 ++ GMADD xvf, s, D0, U0, X0, D0 ++ PTR_ADDI A0, A0, \stride ++ PTR_ADDI B0, B0, 0x04 ++.endm ++ ++.macro KERNEL8xMx1 m, stride ++.rept 8 ++ KERNEL1xMx1 \m, \stride ++.endr ++.endm ++ ++.macro SAVEMx1 m, stride ++#if defined(TRMMKERNEL) ++ GMUL xvf, s, D0, D0, VALPHA ++#else ++ /* Load C0, C1 */ ++ .if \m == 8 ++ GLD xv, , X0, C0, 0x00 ++ .elseif \m == 4 ++ GLD v, , $vr2, C0, 0x00 ++.elseif \m == 2 ++ GLD f, d, $f2, C0, 0x00 ++.elseif \m == 1 ++ GLD f, s, $f2, C0, 0x00 ++ .endif ++ GMADD xvf, s, D0, D0, VALPHA, X0 ++#endif // #if defined(TRMMKERNEL) ++.if \m == 8 ++ GST xv, , D0, C0, 0x00 ++.elseif \m == 4 ++ GST v, , $vr10, C0, 0x00 ++.elseif \m == 2 ++ GST f, d, $f10, C0, 0x00 ++.elseif \m == 1 ++ GST f, s, $f10, C0, 0x00 ++.endif ++#if __loongarch_grlen == 64 ++ GADDI , d, C0, C0, \stride ++#elif __loongarch_grlen == 32 ++ GADDI , w, C0, C0, \stride ++#else ++ GADDI , d, C0, C0, \stride ++#endif ++.endm ++ ++ PROLOGUE ++ push_if_used 26, 32 ++ xvreplve0.w VALPHA, $xr0 ++#if defined (TRMMKERNEL) && !defined(LEFT) ++ PTR_SUB OFF, ZERO, OFFSET ++#else ++ xor OFF, OFF, OFF ++#endif ++ /* if (!(N >> 3)) goto L_N7 */ ++ PTR_SRAI J, N, 3 /* J = bn >> 3 */ ++ andi N, N, 0x07 ++ beq ZERO, J, .L_N7 ++.L_N8: /* J -- */ ++ move C0, C ++ move A0, A ++ PTR_SLLI T0, LDC, 2 ++ PTR_ADDI J, J, -1 /* J-- */ ++#if __loongarch_grlen == 64 ++ GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ ++ C6, C5, T0, C7, C6, T0 ++#elif __loongarch_grlen == 32 ++ GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ ++ C6, C5, T0, C7, C6, T0 ++#else ++ GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ ++ C6, C5, T0, C7, C6, T0 ++#endif ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move OFF, OFFSET ++#endif ++ /* if (!(M >> 4)) goto L_M8 */ ++ PTR_SRAI I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_M8 ++.align 5 ++.L_M16: /* I-- */ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x06 ++ PTR_ADD A0, A0, T0 /* A0 += 16 * OFF 
*/ ++ PTR_SLLI T0, OFF, 0x05 ++ PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ ++#endif ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 16 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 8 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1x16x8_START ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M16_L7 */ ++ beq ZERO,TL, .L_M16_L7 ++.align 5 ++.L_M16_TL1: ++ KERNEL8x16x8 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_M16_TL1 ++.L_M16_L7: ++ andi TL, L, 7 ++ beq TL, ZERO,.L_M16_L0 ++.align 5 ++.L_M16_L71: ++ KERNEL1x16x8 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_M16_L71 ++.L_M16_L0: ++ SAVE16x8 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ PTR_ADDI L, L, -16 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, L, -8 ++#endif ++ PTR_SLLI T0, L, 0x06 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x05 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x10 /* number of values in A */ ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++ PTR_ADDI I, I, -1 /* I-- */ ++ blt ZERO,I, .L_M16 ++.L_M8: ++ /* We have done M & 16, considering M=8/4/2/1 */ ++ andi I, M, 15 ++ beq ZERO,I, .L_M0 ++ ++ andi I, M, 8 ++ beq ZERO,I, .L_M4 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x05 ++ PTR_ADD A0, A0, T0 /* A0 += 8 * OFF */ ++ PTR_SLLI T0, OFF, 0x05 ++ PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ ++#endif ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 8 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 8 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif // #if defined(TRMMKERNEL) ++ KERNEL1xMx8_START 8, 0x20 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M8_L7 */ ++ beq ZERO,TL, .L_M8_L7 ++.align 5 ++.L_M8_TL1: ++ KERNEL8xMx8 8, 0x20 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_M8_TL1 ++.L_M8_L7: ++ /* if (!(L & 7)) goto L_M8_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_M8_L0 ++.align 5 ++.L_M8_L71: ++ KERNEL1xMx8 8, 0x20 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_M8_L71 ++.L_M8_L0: ++ SAVEMx8 8, 0x20 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ PTR_ADDI L, L, -8 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, L, -8 ++#endif ++ PTR_SLLI T0, L, 0x05 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x05 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ /* number of values in A */ ++ PTR_ADDI OFF, OFF, 0x08 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_M4: ++ andi I, M, 4 ++ beq ZERO,I, .L_M2 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x04 ++ PTR_ADD A0, A0, T0 /* A0 += 4 * OFF */ ++ PTR_SLLI T0, OFF, 0x05 ++ PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 
++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 4 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 8 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx8_START 4, 0x10 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M4_L7 */ ++ beq ZERO,TL, .L_M4_L7 ++.align 5 ++.L_M4_TL1: ++ KERNEL8xMx8 4, 0x10 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_M4_TL1 ++.L_M4_L7: ++ /* if (!(L & 7)) goto L_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_M4_L0 ++.L_M4_L71: ++ KERNEL1xMx8 4, 0x10 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_M4_L71 ++.L_M4_L0: ++ SAVEMx8 4, 0x10 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ PTR_ADDI L, L, -4 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, L, -8 ++#endif ++ PTR_SLLI T0, L, 0x04 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x05 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ /* number of values in A */ ++ PTR_ADDI OFF, OFF, 0x04 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_M2: ++ andi I, M, 2 ++ beq ZERO,I, .L_M1 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x03 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x05 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 2 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 8 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx8_START 2, 0x08 ++ ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M2_L7 */ ++ beq ZERO,TL, .L_M2_L7 ++.align 5 ++.L_M2_TL1: ++ KERNEL8xMx8 2, 0x08 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_M2_TL1 ++.L_M2_L7: ++ /* if (!(L & 7)) goto L_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_M2_L0 ++.align 5 ++.L_M2_L71: ++ KERNEL1xMx8 2, 0x08 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_M2_L71 ++.L_M2_L0: ++ SAVEMx8 2, 0x08 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ PTR_ADDI L, L, -2 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, L, -8 ++#endif ++ PTR_SLLI T0, L, 0x03 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x05 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ /* number of values in A */ ++ PTR_ADDI OFF, OFF, 0x02 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_M1: ++ andi I, M, 1 ++ beq ZERO,I, .L_M0 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x02 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x05 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 1 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 8 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx8_START 1, 0x04 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) 
goto L_M1_L7 */ ++ beq ZERO,TL, .L_M1_L7 ++.align 5 ++.L_M1_TL1: ++ KERNEL8xMx8 1, 0x04 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_M1_TL1 ++.L_M1_L7: ++ /* if (!(L & 7)) goto L_M1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_M1_L0 ++.align 5 ++.L_M1_L71: ++ KERNEL1xMx8 1, 0x04 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_M1_L71 ++.L_M1_L0: ++ SAVEMx8 1, 0x04 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ PTR_ADDI L, L, -1 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, L, -8 ++#endif ++ PTR_SLLI T0, L, 0x02 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x05 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ /* number of values in A */ ++ PTR_ADDI OFF, OFF, 0x01 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++.L_M0: ++ /* Add stride for B and C ++ * B += (K * 32) ++ * C += (LDC * 32) ++ */ ++ PTR_SLLI T0, K, 5 ++ PTR_SLLI T1, LDC, 5 ++ PTR_ADD B, B, T0 ++ PTR_ADD C, C, T1 ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ PTR_ADDI OFF, OFF, 0x08 /* number of values in B */ ++#endif ++ blt ZERO, J, .L_N8 ++ ++.L_N7: ++ andi J, N, 4 ++ beq ZERO, J, .L_N3 ++.L_N4: ++ move C0, C ++ move A0, A ++ PTR_SLLI T0, LDC, 2 ++#if __loongarch_grlen == 64 ++ GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0 ++#elif __loongarch_grlen == 32 ++ GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0 ++#else ++ GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0 ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move OFF, OFFSET ++#endif ++ ++ /* if (!(M >> 4)) goto L_N4_M8 */ ++ PTR_SRAI I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_N4_M8 ++.align 5 ++.L_N4_M16: ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x06 ++ PTR_ADD A0, A0, T0 /* A0 += 16 * OFF */ ++ PTR_SLLI T0, OFF, 0x04 ++ PTR_ADD B0, B, T0 /* B0 += 4 * OFF */ ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 16 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1x16x4_START ++ ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N4_L7 */ ++ beq ZERO,TL, .L_N4_M16_L7 ++.align 5 ++.L_N4_M16_TL1: /* TL-- */ ++ KERNEL8x16x4 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N4_M16_TL1 ++.L_N4_M16_L7: ++ /* if (!(L & 7)) goto L_N4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N4_M16_L0 ++.align 5 ++.L_N4_M16_L71: ++ KERNEL1x16x4 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N4_M16_L71 ++.L_N4_M16_L0: ++ SAVE16x4 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -16 ++#else ++ PTR_ADDI L, L, -4 ++#endif ++ PTR_SLLI T0, L, 0x06 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x04 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x10 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++ PTR_ADDI I, I, -1 /* I-- */ ++ blt ZERO,I, .L_N4_M16 ++.L_N4_M8: ++ /* We have done M & 16, considering M=8/4/2/1 */ ++ andi I, M, 15 ++ beq ZERO,I, .L_N4_M0 ++ ++ andi I, M, 8 ++ beq ZERO,I, .L_N4_M4 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI 
T0, OFF, 0x05 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x04 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 8 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx4_START 8, 0x20 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N4_M8_L7 */ ++ beq ZERO,TL, .L_N4_M8_L7 ++.align 5 ++.L_N4_M8_TL1: /* TL-- */ ++ KERNEL8xMx4 8, 0x20 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N4_M8_TL1 ++.L_N4_M8_L7: ++ /* if (!(L & 7)) goto L_N4_M8_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N4_M8_L0 ++.align 5 ++.L_N4_M8_L71: ++ KERNEL1xMx4 8, 0x20 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N4_M8_L71 ++.L_N4_M8_L0: ++ SAVEMx4 8, 0x20 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -8 ++#else ++ PTR_ADDI L, L, -4 ++#endif ++ PTR_SLLI T0, L, 0x05 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x04 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x08 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N4_M4: ++ andi I, M, 4 ++ beq ZERO,I, .L_N4_M2 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x04 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x04 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 4 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx4_START 4, 0x10 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N4_M4_L7 */ ++ beq ZERO,TL, .L_N4_M4_L7 ++.align 5 ++.L_N4_M4_TL1: /* TL-- */ ++ KERNEL8xMx4 4, 0x10 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N4_M4_TL1 ++.L_N4_M4_L7: ++ /* if (!(L & 7)) goto L_N4_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N4_M4_L0 ++.align 5 ++.L_N4_M4_L71: ++ KERNEL1xMx4 4, 0x10 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N4_M4_L71 ++.L_N4_M4_L0: ++ SAVEMx4 4, 0x10 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -4 ++#else ++ PTR_ADDI L, L, -4 ++#endif ++ PTR_SLLI T0, L, 0x04 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x04 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x04 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N4_M2: ++ andi I, M, 2 ++ beq ZERO,I, .L_N4_M1 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x03 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x04 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 2 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx4_START 
2, 0x08 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N4_M2_L7 */ ++ beq ZERO,TL, .L_N4_M2_L7 ++.align 5 ++.L_N4_M2_TL1: /* TL-- */ ++ KERNEL8xMx4 2, 0x08 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N4_M2_TL1 ++.L_N4_M2_L7: ++ /* if (!(L & 7)) goto L_N4_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N4_M2_L0 ++.align 5 ++.L_N4_M2_L71: ++ KERNEL1xMx4 2, 0x08 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N4_M2_L71 ++.L_N4_M2_L0: ++ SAVEMx4 2, 0x08 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -2 ++#else ++ PTR_ADDI L, L, -4 ++#endif ++ PTR_SLLI T0, L, 0x03 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x04 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x02 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N4_M1: ++ andi I, M, 1 ++ beq ZERO,I, .L_N4_M0 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x02 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x04 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 1 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx4_START 1, 0x04 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N4_M1_L7 */ ++ beq ZERO,TL, .L_N4_M1_L7 ++.align 5 ++.L_N4_M1_TL1: /* TL-- */ ++ KERNEL8xMx4 1, 0x04 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N4_M1_TL1 ++.L_N4_M1_L7: ++ /* if (!(L & 7)) goto L_N4_M1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N4_M1_L0 ++.align 5 ++.L_N4_M1_L71: ++ KERNEL1xMx4 1, 0x04 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N4_M1_L71 ++.L_N4_M1_L0: ++ SAVEMx4 1, 0x04 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -1 ++#else ++ PTR_ADDI L, L, -4 ++#endif ++ PTR_SLLI T0, L, 0x02 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x04 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x01 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N4_M0: ++ /* Add stride for B and C ++ * B += 4 * K ++ * C += 4 * LDC ++ */ ++ PTR_SLLI T0, K, 4 ++ PTR_SLLI T1, LDC, 4 ++ PTR_ADD B, B, T0 ++ PTR_ADD C, C, T1 ++ ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ PTR_ADDI OFF, OFF, 0x04 ++#endif ++ /* We must reinit I */ ++ PTR_SRAI I, M, 4 /* I = bm >> 4 */ ++.L_N3: ++ andi J, N, 2 ++ beq ZERO, J, .L_N1 ++ ++.L_N2: ++ move C0, C ++ move A0, A ++ PTR_SLLI T0, LDC, 2 ++ PTR_ADD C1, C0, T0 ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move OFF, OFFSET ++#endif ++ ++ /* if (!(M >> 4)) goto L_N2_M8 */ ++ PTR_SRAI I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_N2_M8 ++.align 5 ++.L_N2_M16: ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x06 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x03 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 16 ++#else ++ /* number of values 
in B */ ++ PTR_ADDI L, OFF, 2 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1x16x2_START ++ ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N2_M16_L7 */ ++ beq ZERO,TL, .L_N2_M16_L7 ++.align 5 ++.L_N2_M16_TL1: /* TL-- */ ++ KERNEL8x16x2 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N2_M16_TL1 ++.L_N2_M16_L7: ++ /* if (!(L & 7)) goto L_N2_M16_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N2_M16_L0 ++.align 5 ++.L_N2_M16_L71: ++ KERNEL1x16x2 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N2_M16_L71 ++.L_N2_M16_L0: ++ SAVE16x2 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -16 ++#else ++ PTR_ADDI L, L, -2 ++#endif ++ PTR_SLLI T0, L, 0x06 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x03 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x10 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++ PTR_ADDI I, I, -1 /* I-- */ ++ blt ZERO,I, .L_N2_M16 ++.L_N2_M8: ++ /* We have done M & 16, considering M=8/4/2/1 */ ++ andi I, M, 15 ++ beq ZERO,I, .L_N2_M0 ++ ++ andi I, M, 8 ++ beq ZERO,I, .L_N2_M4 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x05 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x03 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 8 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 2 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx2_START 8, 0x20 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N2_M8_L7 */ ++ beq ZERO,TL, .L_N2_M8_L7 ++.align 5 ++.L_N2_M8_TL1: /* TL-- */ ++ KERNEL8xMx2 8, 0x20 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N2_M8_TL1 ++.L_N2_M8_L7: ++ /* if (!(L & 7)) goto L_N2_M8_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N2_M8_L0 ++.align 5 ++.L_N2_M8_L71: ++ KERNEL1xMx2 8, 0x20 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N2_M8_L71 ++.L_N2_M8_L0: ++ SAVEMx2 8, 0x20 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -8 ++#else ++ PTR_ADDI L, L, -2 ++#endif ++ PTR_SLLI T0, L, 0x05 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x03 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x08 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N2_M4: ++ andi I, M, 4 ++ beq ZERO,I, .L_N2_M2 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x04 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x03 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 4 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 2 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx2_START 4, 0x10 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N2_M4_L7 */ ++ beq ZERO,TL, .L_N2_M4_L7 ++.align 5 
++.L_N2_M4_TL1: /* TL-- */ ++ KERNEL8xMx2 4, 0x10 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N2_M4_TL1 ++.L_N2_M4_L7: ++ /* if (!(L & 7)) goto L_N2_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N2_M4_L0 ++.align 5 ++.L_N2_M4_L71: ++ KERNEL1xMx2 4, 0x10 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N2_M4_L71 ++.L_N2_M4_L0: ++ SAVEMx2 4, 0x10 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -4 ++#else ++ PTR_ADDI L, L, -2 ++#endif ++ PTR_SLLI T0, L, 0x04 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x03 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x04 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N2_M2: ++ andi I, M, 2 ++ beq ZERO,I, .L_N2_M1 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x03 ++ PTR_ADD A0, A0, T0 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 2 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 2 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx2_START 2, 0x08 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N2_M2_L7 */ ++ beq ZERO,TL, .L_N2_M2_L7 ++.align 5 ++.L_N2_M2_TL1: /* TL-- */ ++ KERNEL8xMx2 2, 0x08 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N2_M2_TL1 ++.L_N2_M2_L7: ++ /* if (!(L & 7)) goto L_N2_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N2_M2_L0 ++.align 5 ++.L_N2_M2_L71: ++ KERNEL1xMx2 2, 0x08 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N2_M2_L71 ++.L_N2_M2_L0: ++ SAVEMx2 2, 0x08 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -2 ++#else ++ PTR_ADDI L, L, -2 ++#endif ++ PTR_SLLI T0, L, 0x03 ++ PTR_ADD A0, A0, T0 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x02 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N2_M1: ++ andi I, M, 1 ++ beq ZERO,I, .L_N2_M0 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x02 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x03 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 1 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 2 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx2_START 1, 0x04 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N2_M1_L7 */ ++ beq ZERO,TL, .L_N2_M1_L7 ++.align 5 ++.L_N2_M1_TL1: /* TL-- */ ++ KERNEL8xMx2 1, 0x04 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N2_M1_TL1 ++.L_N2_M1_L7: ++ /* if (!(L & 7)) goto L_N2_M1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N2_M1_L0 ++.align 5 ++.L_N2_M1_L71: ++ KERNEL1xMx2 1, 0x04 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N2_M1_L71 ++.L_N2_M1_L0: ++ SAVEMx2 1, 0x04 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, 
L, -1 ++#else ++ PTR_ADDI L, L, -2 ++#endif ++ PTR_SLLI T0, L, 0x02 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x03 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x01 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N2_M0: ++ /* Add stride for B and C ++ * B += 2 * K ++ * C += 2 * LDC ++ */ ++ PTR_SLLI T0, K, 3 ++ PTR_SLLI T1, LDC, 3 ++ PTR_ADD B, B, T0 ++ PTR_ADD C, C, T1 ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ PTR_ADDI OFF, OFF, 0x02 ++#endif ++ /* We must reinit I */ ++ PTR_SRAI I, M, 4 /* I = bm >> 4 */ ++.L_N1: ++ andi J, N, 1 ++ beq ZERO, J, .L_N0 ++ move C0, C ++ move A0, A ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move OFF, OFFSET ++#endif ++ /* if (!(M >> 4)) goto L_N1_M8 */ ++ PTR_SRAI I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_N1_M8 ++.L_N1_M16: ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x06 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x02 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 16 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 1 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1x16x1_START ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_M16_L7 */ ++ beq ZERO,TL, .L_N1_M16_L7 ++.align 5 ++.L_N1_M16_TL1: /* TL-- */ ++ KERNEL8x16x1 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N1_M16_TL1 ++.L_N1_M16_L7: ++ /* if (!(L & 7)) goto L_N1_M16_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N1_M16_L0 ++.align 5 ++.L_N1_M16_L71: ++ KERNEL1x16x1 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N1_M16_L71 ++.L_N1_M16_L0: ++ SAVE16x1 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -16 ++#else ++ PTR_ADDI L, L, -1 ++#endif ++ PTR_SLLI T0, L, 0x06 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x02 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x10 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++ PTR_ADDI I, I, -1 /* I-- */ ++ blt ZERO,I, .L_N1_M16 ++.L_N1_M8: ++ /* We have done M & 16, considering M=8/4/2/1 */ ++ andi I, M, 15 ++ beq ZERO,I, .L_N1_M0 ++ ++ andi I, M, 8 ++ beq ZERO,I, .L_N1_M4 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x05 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x02 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 8 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 1 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx1_START 8, 0x20 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_M8_L7 */ ++ beq ZERO,TL, .L_N1_M8_L7 ++.align 5 ++.L_N1_M8_TL1: /* TL-- */ ++ KERNEL8xMx1 8, 0x20 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N1_M8_TL1 ++.L_N1_M8_L7: ++ /* if (!(L & 7)) goto L_N1_M8_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N1_M8_L0 ++.align 5 ++.L_N1_M8_L71: ++ KERNEL1xMx1 8, 0x20 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, 
.L_N1_M8_L71 ++.L_N1_M8_L0: ++ SAVEMx1 8, 0x20 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -8 ++#else ++ PTR_ADDI L, L, -1 ++#endif ++ PTR_SLLI T0, L, 0x05 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x02 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x08 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N1_M4: ++ andi I, M, 4 ++ beq ZERO,I, .L_N1_M2 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x04 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x02 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 4 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 1 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx1_START 4, 0x10 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_M4_L7 */ ++ beq ZERO,TL, .L_N1_M4_L7 ++.align 5 ++.L_N1_M4_TL1: /* TL-- */ ++ KERNEL8xMx1 4, 0x10 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N1_M4_TL1 ++.L_N1_M4_L7: ++ /* if (!(L & 7)) goto L_N1_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N1_M4_L0 ++.align 5 ++.L_N1_M4_L71: ++ KERNEL1xMx1 4, 0x10 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N1_M4_L71 ++.L_N1_M4_L0: ++ SAVEMx1 4, 0x10 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -4 ++#else ++ PTR_ADDI L, L, -1 ++#endif ++ PTR_SLLI T0, L, 0x04 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x02 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x04 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N1_M2: ++ andi I, M, 2 ++ beq ZERO,I, .L_N1_M1 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x03 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x02 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 2 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 1 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx1_START 2, 0x08 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_M2_L7 */ ++ beq ZERO,TL, .L_N1_M2_L7 ++.align 5 ++.L_N1_M2_TL1: /* TL-- */ ++ KERNEL8xMx1 2, 0x08 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N1_M2_TL1 ++.L_N1_M2_L7: ++ /* if (!(L & 7)) goto L_N1_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N1_M2_L0 ++.align 5 ++.L_N1_M2_L71: ++ KERNEL1xMx1 2, 0x08 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N1_M2_L71 ++.L_N1_M2_L0: ++ SAVEMx1 2, 0x08 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -2 ++#else ++ PTR_ADDI L, L, -1 ++#endif ++ PTR_SLLI T0, L, 0x03 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x02 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x02 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++.L_N1_M1: 
++ andi I, M, 1 ++ beq ZERO,I, .L_N1_M0 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x02 ++ PTR_ADD A0, A0, T0 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 1 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 1 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx1_START 1, 0x04 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_M1_L7 */ ++ beq ZERO,TL, .L_N1_M1_L7 ++.align 5 ++.L_N1_M1_TL1: /* TL-- */ ++ KERNEL8xMx1 1, 0x04 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N1_M1_TL1 ++.L_N1_M1_L7: ++ /* if (!(L & 7)) goto L_N1_M1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N1_M1_L0 ++.align 5 ++.L_N1_M1_L71: ++ KERNEL1xMx1 1, 0x04 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N1_M1_L71 ++.L_N1_M1_L0: ++ SAVEMx1 1, 0x04 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -1 ++#else ++ PTR_ADDI L, L, -1 ++#endif ++ PTR_SLLI T0, L, 0x02 ++ PTR_ADD A0, A0, T0 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x01 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N1_M0: ++.L_N0: ++ pop_if_used 26, 32 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/sgemm_ncopy_16_lasx.S b/kernel/loongarch64/sgemm_ncopy_16_lasx.S +new file mode 100644 +index 000000000..266c07c5c +--- /dev/null ++++ b/kernel/loongarch64/sgemm_ncopy_16_lasx.S +@@ -0,0 +1,463 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/08/23 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++*********************************************************************/ ++ ++/* Function parameters */ ++#define M $r4 // param 1: m ++#define N $r5 // param 2: n ++#define SRC $r6 // param 3: src ++#define LDA $r7 // param 4: lda ++#define DST $r8 // param 5: dst ++ ++#define I $r9 ++#define J $r10 ++#define S1 $r12 ++#define S2 $r13 ++#define S3 $r14 ++#define S4 $r15 ++#define S5 $r16 ++#define S6 $r17 ++#define S7 $r18 ++#define S8 $r19 ++#define S9 $r20 ++#define S10 $r23 ++#define S11 $r24 ++#define S12 $r25 ++#define S13 $r26 ++#define S14 $r27 ++#define S15 $r28 ++#define S16 $r29 ++#define TD $r30 ++#define TS $r31 ++#define TL $r7 ++#define T0 $r6 ++#undef ZERO ++#define ZERO $r0 ++ ++#define F0 $f0 ++#define F1 $f1 ++#define F2 $f2 ++#define F3 $f3 ++#define F4 $f4 ++#define F5 $f5 ++#define F6 $f6 ++#define F7 $f7 ++/* LASX vectors */ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++#define U8 $xr8 ++#define U9 $xr9 ++#define U10 $xr10 ++#define U11 $xr11 ++#define U12 $xr12 ++#define U13 $xr13 ++#define U14 $xr14 ++#define U15 $xr15 ++#define D0 $xr16 ++#define D1 $xr17 ++#define D2 $xr18 ++#define D3 $xr19 ++#define D4 $xr20 ++#define D5 $xr21 ++#define D6 $xr22 ++#define D7 $xr23 ++#define D8 $xr24 ++#define D9 $xr25 ++#define D10 $xr26 ++#define D11 $xr27 ++#define D12 $xr28 ++#define D13 $xr29 ++#define D14 $xr30 ++#define D15 $xr31 ++ ++// Loops outline ++//.L_N16 <------------------- ++//| .L_M8: | ++//| .L_M7: | Main Loop ++//| .L_M1: | ++//| .L_M0: --------------- ++//.L_N15: ++//.L_N8: ++//| .L_N8_M8: ++//| .L_N8_M7: ++//| .L_N8_M1: ++//.L_N7: ++//.L_N4: ++//| .L_N4_M4: ++//| .L_N4_M3: ++//| .L_N4_M1: ++//.L_N3: ++//.L_N2: ++//| .L_N2_M2: ++//| .L_N2_M1: ++//.L_N1: ++//| .L_N1_M1: ++//.L_N0 ++ ++ PROLOGUE ++ push_if_used 26, 32 ++ ++ move TD, DST ++ move TS, SRC ++ PTR_SLLI TL, LDA, 0x02 ++ PTR_SLLI T0, TL, 0x01 ++ PTR_SRAI J, N, 0x04 ++ beq J, ZERO, .L_N15 ++.align 5 ++.L_N16: ++ move S1, TS ++ PTR_ADD S2, TS, TL ++ PTR_SRAI I, M, 0x03 ++ PTR_ADD S3, S2, TL ++ PTR_ADDI J, J, -1 ++ PTR_ADD S4, S3, TL ++ PTR_ADD S5, S3, T0 ++ PTR_ADD S6, S4, T0 ++ PTR_ADD S7, S5, T0 ++ PTR_ADD S8, S6, T0 ++ PTR_ADD S9, S7, T0 ++ PTR_ADD S10, S8, T0 ++ PTR_ADD S11, S9, T0 ++ PTR_ADD S12, S10, T0 ++ PTR_ADD S13, S11, T0 ++ PTR_ADD S14, S12, T0 ++ PTR_ADD S15, S13, T0 ++ PTR_ADD S16, S14, T0 ++ PTR_ADD TS, S15, T0 ++ beq I, ZERO, .L_M7 ++.align 5 ++.L_M8: ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ xvld U4, S5, 0x00 ++ xvld U5, S6, 0x00 ++ xvld U6, S7, 0x00 ++ xvld U7, S8, 0x00 ++ xvld U8, S9, 0x00 ++ xvld U9, S10, 0x00 ++ xvld U10, S11, 0x00 ++ xvld U11, S12, 0x00 ++ xvld U12, S13, 0x00 ++ xvld U13, S14, 0x00 ++ xvld U14, S15, 0x00 ++ xvld U15, S16, 0x00 ++ ++ GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ ++ U0, U1, U2, U3, U4, U5, U6, U7, \ ++ D1, D3, D5, D7 // As tmp ++ GTRANSPOSE8x8_W D1, D3, D5, D7, D9, D11, D13, D15, \ ++ U8, U9, U10, U11, U12, U13, U14, U15, \ ++ U0, U1, U2, U3 // As tmp ++ GST xv, , D0, TD, 0x00, D1, TD, 0x20, D2, TD, 0x40, D3, TD, 0x60, \ ++ D4, TD, 0x80, D5, TD, 0xA0, D6, TD, 0xC0, D7, TD, 0xE0 ++ PTR_ADDI TD, TD, 0x100 
++ GST xv, , D8, TD, 0x00, D9, TD, 0x20, D10, TD, 0x40, D11, TD, 0x60, \ ++ D12, TD, 0x80, D13, TD, 0xA0, D14, TD, 0xC0, D15, TD, 0xE0 ++ PTR_ADDI TD, TD, 0x100 ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI S2, S2, 0x20 ++ PTR_ADDI S3, S3, 0x20 ++ PTR_ADDI S4, S4, 0x20 ++ PTR_ADDI S5, S5, 0x20 ++ PTR_ADDI S6, S6, 0x20 ++ PTR_ADDI S7, S7, 0x20 ++ PTR_ADDI S8, S8, 0x20 ++ PTR_ADDI S9, S9, 0x20 ++ PTR_ADDI S10, S10, 0x20 ++ PTR_ADDI S11, S11, 0x20 ++ PTR_ADDI S12, S12, 0x20 ++ PTR_ADDI S13, S13, 0x20 ++ PTR_ADDI S14, S14, 0x20 ++ PTR_ADDI S15, S15, 0x20 ++ PTR_ADDI S16, S16, 0x20 ++ ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_M8 ++.L_M7: ++ andi I, M, 0x07 ++ beq I, ZERO, .L_M0 ++.align 5 ++.L_M1: ++ fld.s F0, S1, 0x00 ++ fld.s F1, S2, 0x00 ++ fld.s F2, S3, 0x00 ++ fld.s F3, S4, 0x00 ++ fld.s F4, S5, 0x00 ++ fld.s F5, S6, 0x00 ++ fld.s F6, S7, 0x00 ++ fld.s F7, S8, 0x00 ++ ++ fst.s F0, TD, 0x00 ++ fst.s F1, TD, 0x04 ++ fst.s F2, TD, 0x08 ++ fst.s F3, TD, 0x0C ++ fst.s F4, TD, 0x10 ++ fst.s F5, TD, 0x14 ++ fst.s F6, TD, 0x18 ++ fst.s F7, TD, 0x1C ++ ++ PTR_ADDI S1, S1, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ PTR_ADDI S3, S3, 0x04 ++ PTR_ADDI S4, S4, 0x04 ++ PTR_ADDI S5, S5, 0x04 ++ PTR_ADDI S6, S6, 0x04 ++ PTR_ADDI S7, S7, 0x04 ++ PTR_ADDI S8, S8, 0x04 ++ PTR_ADDI TD, TD, 0x20 ++ ++ fld.s F0, S9, 0x00 ++ fld.s F1, S10, 0x00 ++ fld.s F2, S11, 0x00 ++ fld.s F3, S12, 0x00 ++ fld.s F4, S13, 0x00 ++ fld.s F5, S14, 0x00 ++ fld.s F6, S15, 0x00 ++ fld.s F7, S16, 0x00 ++ ++ fst.s F0, TD, 0x00 ++ fst.s F1, TD, 0x04 ++ fst.s F2, TD, 0x08 ++ fst.s F3, TD, 0x0C ++ fst.s F4, TD, 0x10 ++ fst.s F5, TD, 0x14 ++ fst.s F6, TD, 0x18 ++ fst.s F7, TD, 0x1C ++ ++ PTR_ADDI S9, S9, 0x04 ++ PTR_ADDI S10, S10, 0x04 ++ PTR_ADDI S11, S11, 0x04 ++ PTR_ADDI S12, S12, 0x04 ++ PTR_ADDI S13, S13, 0x04 ++ PTR_ADDI S14, S14, 0x04 ++ PTR_ADDI S15, S15, 0x04 ++ PTR_ADDI S16, S16, 0x04 ++ PTR_ADDI TD, TD, 0x20 ++ ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_M1 ++.L_M0: ++ blt ZERO, J, .L_N16 ++.L_N15: ++ andi J, N, 0x0f ++ beq ZERO, J, .L_N0 ++ ++ andi J, N, 0x08 ++ beq ZERO, J, .L_N7 ++.L_N8: ++ move S1, TS ++ PTR_ADD S2, TS, TL ++ PTR_SRAI I, M, 0x03 ++ PTR_ADD S3, S2, TL ++ PTR_ADD S4, S2, T0 ++ PTR_ADD S5, S3, T0 ++ PTR_ADD S6, S4, T0 ++ PTR_ADD S7, S5, T0 ++ PTR_ADD S8, S6, T0 ++ PTR_ADD TS, S7, T0 ++ beq I, ZERO, .L_N8_M7 ++.align 5 ++.L_N8_M8: ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ xvld U4, S5, 0x00 ++ xvld U5, S6, 0x00 ++ xvld U6, S7, 0x00 ++ xvld U7, S8, 0x00 ++ ++ GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ ++ U0, U1, U2, U3, U4, U5, U6, U7, \ ++ D1, D3, D5, D7 // As tmp ++ GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ ++ D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 ++ PTR_ADDI TD, TD, 0x100 ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI S2, S2, 0x20 ++ PTR_ADDI S3, S3, 0x20 ++ PTR_ADDI S4, S4, 0x20 ++ PTR_ADDI S5, S5, 0x20 ++ PTR_ADDI S6, S6, 0x20 ++ PTR_ADDI S7, S7, 0x20 ++ PTR_ADDI S8, S8, 0x20 ++ ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_N8_M8 ++.L_N8_M7: ++ andi I, M, 0x07 ++ beq I, ZERO, .L_N7 ++.align 5 ++.L_N8_M1: ++ fld.s F0, S1, 0x00 ++ fld.s F1, S2, 0x00 ++ fld.s F2, S3, 0x00 ++ fld.s F3, S4, 0x00 ++ fld.s F4, S5, 0x00 ++ fld.s F5, S6, 0x00 ++ fld.s F6, S7, 0x00 ++ fld.s F7, S8, 0x00 ++ ++ fst.s F0, TD, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ fst.s F1, TD, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ fst.s F2, TD, 0x08 ++ PTR_ADDI S3, S3, 0x04 ++ fst.s F3, TD, 0x0C ++ PTR_ADDI S4, S4, 0x04 ++ fst.s F4, TD, 0x10 ++ PTR_ADDI S5, S5, 0x04 ++ fst.s F5, TD, 0x14 ++ PTR_ADDI S6, 
S6, 0x04 ++ fst.s F6, TD, 0x18 ++ PTR_ADDI S7, S7, 0x04 ++ fst.s F7, TD, 0x1C ++ PTR_ADDI S8, S8, 0x04 ++ ++ PTR_ADDI TD, TD, 0x20 ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_N8_M1 ++.L_N7: ++ andi J, N, 0x07 ++ beq ZERO, J, .L_N0 ++ ++ andi J, N, 0x04 ++ beq ZERO, J, .L_N3 ++.L_N4: ++ move S1, TS ++ PTR_ADD S2, TS, TL ++ PTR_SRAI I, M, 0x02 ++ PTR_ADD S3, S2, TL ++ PTR_ADD S4, S2, T0 ++ PTR_ADD TS, S3, T0 ++ beq I, ZERO, .L_N4_M3 ++.align 5 ++.L_N4_M4: ++ GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 ++ GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 ++ GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 ++ GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 ++ GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 ++ GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 ++ PTR_ADDI S1, S1, 0x10 ++ PTR_ADDI S2, S2, 0x10 ++ PTR_ADDI S3, S3, 0x10 ++ PTR_ADDI S4, S4, 0x10 ++ PTR_ADDI TD, TD, 0x40 ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_N4_M4 ++.L_N4_M3: ++ andi I, M, 0x03 ++ beq I, ZERO, .L_N3 ++.align 5 ++.L_N4_M1: ++ fld.s F0, S1, 0x00 ++ fld.s F1, S2, 0x00 ++ fld.s F2, S3, 0x00 ++ fld.s F3, S4, 0x00 ++ ++ fst.s F0, TD, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ fst.s F1, TD, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ fst.s F2, TD, 0x08 ++ PTR_ADDI S3, S3, 0x04 ++ fst.s F3, TD, 0x0C ++ PTR_ADDI S4, S4, 0x04 ++ ++ PTR_ADDI TD, TD, 0x10 ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_N4_M1 ++.L_N3: ++ andi J, N, 0x03 ++ beq ZERO, J, .L_N0 ++ ++ andi J, N, 0x02 ++ beq ZERO, J, .L_N1 ++.L_N2: ++ move S1, TS ++ PTR_ADD S2, TS, TL ++ PTR_SRAI I, M, 0x01 ++ PTR_ADD TS, S2, TL ++ beq I, ZERO, .L_N2_M1 ++.align 5 ++.L_N2_M2: ++ GLD f, d, F0, S1, 0x00, F1, S2, 0x00 ++ vilvl.w $vr0, $vr1, $vr0 ++ GST v, , $vr0, TD, 0x00 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI S2, S2, 0x08 ++ PTR_ADDI TD, TD, 0x10 ++ ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_N2_M2 ++.L_N2_M1: ++ andi I, M, 0x01 ++ beq I, ZERO, .L_N1 ++ ++ fld.s F0, S1, 0x00 ++ fld.s F1, S2, 0x00 ++ ++ fst.s F0, TD, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ fst.s F1, TD, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ PTR_ADDI TD, TD, 0x08 ++.align 5 ++.L_N1: ++ move S1, TS ++ beq ZERO, M, .L_N0 ++.L_N1_M1: ++ fld.s F0, S1, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ fst.s F0, TD, 0x00 ++ PTR_ADDI TD, TD, 0x04 ++ PTR_ADDI M, M, -1 ++ blt ZERO, M, .L_N1_M1 ++.L_N0: ++ pop_if_used 26, 32 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/sgemm_ncopy_8_lasx.S b/kernel/loongarch64/sgemm_ncopy_8_lasx.S +new file mode 100644 +index 000000000..5c173568b +--- /dev/null ++++ b/kernel/loongarch64/sgemm_ncopy_8_lasx.S +@@ -0,0 +1,298 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. 
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/08/23 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++*********************************************************************/ ++ ++/* Function parameters */ ++#define M $r4 // param 1: m ++#define N $r5 // param 2: n ++#define SRC $r6 // param 3: src ++#define LDA $r7 // param 4: lda ++#define DST $r8 // param 5: dst ++ ++#define I $r9 ++#define J $r10 ++#define S1 $r12 ++#define S2 $r13 ++#define S3 $r14 ++#define S4 $r15 ++#define S5 $r16 ++#define S6 $r17 ++#define S7 $r18 ++#define S8 $r19 ++#define TD $r20 ++#define TS $r11 ++#define TL $r7 ++#define T0 $r6 ++#undef ZERO ++#define ZERO $r0 ++ ++#define F0 $f0 ++#define F1 $f1 ++#define F2 $f2 ++#define F3 $f3 ++#define F4 $f4 ++#define F5 $f5 ++#define F6 $f6 ++#define F7 $f7 ++/* LASX vectors */ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++#define D0 $xr8 ++#define D1 $xr9 ++#define D2 $xr10 ++#define D3 $xr11 ++#define D4 $xr12 ++#define D5 $xr13 ++#define D6 $xr14 ++#define D7 $xr15 ++#define D8 $xr16 ++#define D10 $xr17 ++#define D12 $xr18 ++#define D14 $xr19 ++ ++// Loops outline ++//.L_N8: <---------------- ++//| .L_M8: | ++//| .L_M7: | Main Loop ++//| .L_M1: | ++//| .L_M0:-------------- ++//.L_N7: ++//.L_N4: ++//| .L_N4_M4: ++//| .L_N4_M3: ++//| .L_N4_M1: ++//.L_N3: ++//.L_N2: ++//| .L_N2_M2: ++//| .L_N2_M1: ++//.L_N1: ++//| .L_N1_M1: ++//.L_N0 ++ ++ PROLOGUE ++ push_if_used 17, 20 ++ ++ move TD, DST ++ move TS, SRC ++ PTR_SLLI TL, LDA, 0x02 ++ PTR_SLLI T0, TL, 0x01 ++ PTR_SRAI J, N, 0x03 ++ beq J, ZERO, .L_N7 ++.align 5 ++.L_N8: ++ move S1, TS ++ PTR_ADD S2, TS, TL ++ PTR_SRAI I, M, 0x03 ++ PTR_ADD S3, S2, TL ++ PTR_ADDI J, J, -1 ++ PTR_ADD S4, S2, T0 ++ PTR_ADD S5, S3, T0 ++ PTR_ADD S6, S4, T0 ++ PTR_ADD S7, S5, T0 ++ PTR_ADD S8, S6, T0 ++ PTR_ADD TS, S7, T0 ++ beq I, ZERO, .L_M7 ++.align 5 ++.L_M8: ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ xvld U4, S5, 0x00 ++ xvld U5, S6, 0x00 ++ xvld U6, S7, 0x00 ++ xvld U7, S8, 0x00 ++ ++ GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ ++ U0, U1, U2, U3, U4, U5, U6, U7, \ ++ D1, D3, D5, D7 // As tmp ++ GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ ++ D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 ++ PTR_ADDI TD, TD, 0x100 ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI S2, S2, 0x20 ++ PTR_ADDI S3, S3, 0x20 ++ PTR_ADDI S4, S4, 0x20 ++ PTR_ADDI S5, S5, 0x20 ++ PTR_ADDI S6, S6, 0x20 ++ PTR_ADDI S7, S7, 0x20 ++ 
PTR_ADDI S8, S8, 0x20 ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_M8 ++.L_M7: ++ andi I, M, 0x07 ++ beq I, ZERO, .L_M0 ++.align 5 ++.L_M1: ++ fld.s F0, S1, 0x00 ++ fld.s F1, S2, 0x00 ++ fld.s F2, S3, 0x00 ++ fld.s F3, S4, 0x00 ++ fld.s F4, S5, 0x00 ++ fld.s F5, S6, 0x00 ++ fld.s F6, S7, 0x00 ++ fld.s F7, S8, 0x00 ++ ++ fst.s F0, TD, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ fst.s F1, TD, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ fst.s F2, TD, 0x08 ++ PTR_ADDI S3, S3, 0x04 ++ fst.s F3, TD, 0x0C ++ PTR_ADDI S4, S4, 0x04 ++ fst.s F4, TD, 0x10 ++ PTR_ADDI S5, S5, 0x04 ++ fst.s F5, TD, 0x14 ++ PTR_ADDI S6, S6, 0x04 ++ fst.s F6, TD, 0x18 ++ PTR_ADDI S7, S7, 0x04 ++ fst.s F7, TD, 0x1C ++ PTR_ADDI S8, S8, 0x04 ++ ++ PTR_ADDI TD, TD, 0x20 ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_M1 ++.L_M0: ++ blt ZERO, J, .L_N8 ++.L_N7: ++ andi J, N, 0x07 ++ beq ZERO, J, .L_N0 ++ ++ andi J, N, 0x04 ++ beq ZERO, J, .L_N3 ++.L_N4: ++ move S1, TS ++ PTR_ADD S2, TS, TL ++ PTR_SRAI I, M, 0x02 ++ PTR_ADD S3, S2, TL ++ PTR_ADD S4, S2, T0 ++ PTR_ADD TS, S3, T0 ++ beq I, ZERO, .L_N4_M3 ++.align 5 ++.L_N4_M4: ++ GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 ++ GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 ++ GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 ++ GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 ++ GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 ++ GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 ++ PTR_ADDI S1, S1, 0x10 ++ PTR_ADDI S2, S2, 0x10 ++ PTR_ADDI S3, S3, 0x10 ++ PTR_ADDI S4, S4, 0x10 ++ PTR_ADDI TD, TD, 0x40 ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_N4_M4 ++.L_N4_M3: ++ andi I, M, 0x03 ++ beq I, ZERO, .L_N3 ++.align 5 ++.L_N4_M1: ++ fld.s F0, S1, 0x00 ++ fld.s F1, S2, 0x00 ++ fld.s F2, S3, 0x00 ++ fld.s F3, S4, 0x00 ++ ++ fst.s F0, TD, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ fst.s F1, TD, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ fst.s F2, TD, 0x08 ++ PTR_ADDI S3, S3, 0x04 ++ fst.s F3, TD, 0x0C ++ PTR_ADDI S4, S4, 0x04 ++ ++ PTR_ADDI TD, TD, 0x10 ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_N4_M1 ++.L_N3: ++ andi J, N, 0x03 ++ beq ZERO, J, .L_N0 ++ ++ andi J, N, 0x02 ++ beq ZERO, J, .L_N1 ++.L_N2: ++ move S1, TS ++ PTR_ADD S2, TS, TL ++ PTR_SRAI I, M, 0x01 ++ PTR_ADD TS, S2, TL ++ beq I, ZERO, .L_N2_M1 ++.align 5 ++.L_N2_M2: ++ GLD f, d, F0, S1, 0x00, F1, S2, 0x00 ++ vilvl.w $vr0, $vr1, $vr0 ++ GST v, , $vr0, TD, 0x00 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI S2, S2, 0x08 ++ PTR_ADDI TD, TD, 0x10 ++ ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_N2_M2 ++.L_N2_M1: ++ andi I, M, 0x01 ++ beq I, ZERO, .L_N1 ++ ++ fld.s F0, S1, 0x00 ++ fld.s F1, S2, 0x00 ++ ++ fst.s F0, TD, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ fst.s F1, TD, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ PTR_ADDI TD, TD, 0x08 ++.align 5 ++.L_N1: ++ move S1, TS ++ beq ZERO, M, .L_N0 ++.L_N1_M1: ++ fld.s F0, S1, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ fst.s F0, TD, 0x00 ++ PTR_ADDI TD, TD, 0x04 ++ PTR_ADDI M, M, -1 ++ blt ZERO, M, .L_N1_M1 ++.L_N0: ++ pop_if_used 17, 20 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/sgemm_tcopy_16_lasx.S b/kernel/loongarch64/sgemm_tcopy_16_lasx.S +new file mode 100644 +index 000000000..d9789bdcd +--- /dev/null ++++ b/kernel/loongarch64/sgemm_tcopy_16_lasx.S +@@ -0,0 +1,526 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. 
Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/08/23 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++*********************************************************************/ ++ ++/* Function parameters */ ++#define M $r4 // param 1: m ++#define N $r5 // param 2: n ++#define SRC $r6 // param 3: src ++#define LDA $r7 // param 4: lda ++#define DST $r8 // param 5: dst ++ ++#define I $r9 ++#define J $r10 ++#define S0 $r11 ++#define S1 $r12 ++#define S2 $r13 ++#define S3 $r14 ++#define S4 $r15 ++#define S5 $r16 ++#define S6 $r17 ++#define S7 $r18 ++#define S8 $r19 ++#define P0 $r20 ++#define P1 $r23 ++#define P2 $r24 ++#define P3 $r25 ++#define P4 $r26 ++#define P5 $r27 ++#define T0 $r28 ++#define T1 $r29 ++#define TL $r7 ++#define ZERO $r0 ++ ++/* LASX vectors */ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++ ++// Loops outline ++//.L_M8 <------------------- ++//| .L_N16: | ++//| .L_N15: | ++//| .L_N8: | ++//| .L_N7: | Main Loop ++//| .L_N4: | ++//| .L_N3: | ++//| .L_N2: | ++//| .L_N1: | ++//| .L_N0: --------------- ++//.L_M7 ++//.L_M4 ++//| .L_M4_N16: ++//| .L_M4_N15: ++//| .L_M4_N8: ++//| .L_M4_N7: ++//| .L_M4_N4: ++//| .L_M4_N3: ++//| .L_M4_N2: ++//| .L_M4_N1: ++//.L_M3 ++//.L_M2 ++//| .L_M2_N16: ++//| .L_M2_N15: ++//| .L_M2_N8: ++//| .L_M2_N7: ++//| .L_M2_N4: ++//| .L_M2_N3: ++//| .L_M2_N2: ++//| .L_M2_N1: ++//.L_M1 ++//| .L_M1_N16: ++//| .L_M1_N15: ++//| .L_M1_N8: ++//| .L_M1_N7: ++//| .L_M1_N4: ++//| .L_M1_N3: ++//| .L_M1_N2: ++//| .L_M1_N1: ++//.L_M0 ++ ++ PROLOGUE ++ push_if_used 24, 8 ++ ++ move S0, SRC ++ move P0, DST ++ ++ PTR_SRAI T0, N, 0x04 ++ PTR_SRAI T1, N, 0x03 ++ PTR_SLLI T0, T0, 0x04 ++ PTR_SLLI T1, T1, 0x03 ++ ++ PTR_MUL P2, M, T0 ++ PTR_MUL P3, M, T1 ++ PTR_SLLI P2, P2, 0x02 ++ PTR_SLLI P3, P3, 0x02 ++ PTR_ADD P2, DST, P2 ++ PTR_ADD P3, DST, P3 ++ ++ PTR_SRAI T0, N, 0x02 ++ PTR_SRAI T1, N, 0x01 ++ PTR_SLLI T0, T0, 0x02 ++ PTR_SLLI T1, T1, 0x01 ++ PTR_MUL P4, M, T0 ++ PTR_MUL P5, M, T1 ++ 
PTR_SLLI P4, P4, 0x02 ++ PTR_SLLI P5, P5, 0x02 ++ PTR_ADD P4, DST, P4 ++ PTR_ADD P5, DST, P5 ++ ++ PTR_SLLI TL, LDA, 0x02 ++ PTR_SRAI J, M, 0x03 ++ PTR_SLLI T0, TL, 0x01 ++ PTR_SLLI T1, M, 0x06 ++ beq ZERO, J, .L_M7 ++.align 5 ++.L_M8: ++ move S1, S0 ++ PTR_ADD S2, S0, TL ++ PTR_ADD S3, S1, T0 ++ PTR_ADD S4, S2, T0 ++ PTR_ADD S5, S3, T0 ++ PTR_ADD S6, S4, T0 ++ PTR_ADD S7, S5, T0 ++ PTR_ADD S8, S6, T0 ++ PTR_ADD S0, S7, T0 ++ ++ move P1, P0 ++ PTR_ADDI P0, P0, 0x200 ++ ++ PTR_SRAI I, N, 0x04 ++ PTR_ADDI J, J, -1 ++ beq ZERO, I, .L_N15 ++.L_N16: ++ xvld U0, S1, 0x00 ++ xvld U1, S1, 0x20 ++ xvld U2, S2, 0x00 ++ xvld U3, S2, 0x20 ++ ++ xvst U0, P1, 0x00 ++ xvst U1, P1, 0x20 ++ xvst U2, P1, 0x40 ++ xvst U3, P1, 0x60 ++ ++ xvld U4, S3, 0x00 ++ xvld U5, S3, 0x20 ++ xvld U6, S4, 0x00 ++ xvld U7, S4, 0x20 ++ ++ xvst U4, P1, 0x80 ++ xvst U5, P1, 0xA0 ++ xvst U6, P1, 0xC0 ++ xvst U7, P1, 0xE0 ++ ++ xvld U0, S5, 0x00 ++ xvld U1, S5, 0x20 ++ xvld U2, S6, 0x00 ++ xvld U3, S6, 0x20 ++ ++ xvst U0, P1, 0x100 ++ xvst U1, P1, 0x120 ++ xvst U2, P1, 0x140 ++ xvst U3, P1, 0x160 ++ ++ xvld U4, S7, 0x00 ++ xvld U5, S7, 0x20 ++ xvld U6, S8, 0x00 ++ xvld U7, S8, 0x20 ++ ++ xvst U4, P1, 0x180 ++ xvst U5, P1, 0x1A0 ++ xvst U6, P1, 0x1C0 ++ xvst U7, P1, 0x1E0 ++ ++ PTR_ADDI S1, S1, 0x40 ++ PTR_ADDI S2, S2, 0x40 ++ PTR_ADDI S3, S3, 0x40 ++ PTR_ADDI S4, S4, 0x40 ++ PTR_ADDI S5, S5, 0x40 ++ PTR_ADDI S6, S6, 0x40 ++ PTR_ADDI S7, S7, 0x40 ++ PTR_ADDI S8, S8, 0x40 ++ ++ PTR_ADDI I, I, -1 ++ PTR_ADD P1, P1, T1 ++ blt ZERO, I, .L_N16 ++.L_N15: ++ andi I, N, 0x08 ++ beq ZERO, I, .L_N7 ++.L_N8: ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ xvld U4, S5, 0x00 ++ xvld U5, S6, 0x00 ++ xvld U6, S7, 0x00 ++ xvld U7, S8, 0x00 ++ ++ GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60, \ ++ U4, P2, 0x80, U5, P2, 0xA0, U6, P2, 0xC0, U7, P2, 0xE0 ++ ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI S2, S2, 0x20 ++ PTR_ADDI S3, S3, 0x20 ++ PTR_ADDI S4, S4, 0x20 ++ PTR_ADDI S5, S5, 0x20 ++ PTR_ADDI S6, S6, 0x20 ++ PTR_ADDI S7, S7, 0x20 ++ PTR_ADDI S8, S8, 0x20 ++ PTR_ADDI P2, P2, 0x100 ++.L_N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_N3 ++.L_N4: ++ GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ ++ $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 ++ GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30, \ ++ $vr4, P3, 0x40, $vr5, P3, 0x50, $vr6, P3, 0x60, $vr7, P3, 0x70 ++ PTR_ADDI S1, S1, 0x10 ++ PTR_ADDI S2, S2, 0x10 ++ PTR_ADDI S3, S3, 0x10 ++ PTR_ADDI S4, S4, 0x10 ++ PTR_ADDI S5, S5, 0x10 ++ PTR_ADDI S6, S6, 0x10 ++ PTR_ADDI S7, S7, 0x10 ++ PTR_ADDI S8, S8, 0x10 ++ PTR_ADDI P3, P3, 0x80 ++.L_N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_N1 ++.L_N2: ++ GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ ++ $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 ++ GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18, \ ++ $f4, P4, 0x20, $f5, P4, 0x28, $f6, P4, 0x30, $f7, P4, 0x38 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI S2, S2, 0x08 ++ PTR_ADDI S3, S3, 0x08 ++ PTR_ADDI S4, S4, 0x08 ++ PTR_ADDI S5, S5, 0x08 ++ PTR_ADDI S6, S6, 0x08 ++ PTR_ADDI S7, S7, 0x08 ++ PTR_ADDI S8, S8, 0x08 ++ PTR_ADDI P4, P4, 0x40 ++.L_N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_N0 ++ ++ GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ ++ $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 ++ GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C, \ ++ $f4, P5, 0x10, $f5, P5, 0x14, $f6, P5, 
0x18, $f7, P5, 0x1C ++ PTR_ADDI S1, S1, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ PTR_ADDI S3, S3, 0x04 ++ PTR_ADDI S4, S4, 0x04 ++ PTR_ADDI S5, S5, 0x04 ++ PTR_ADDI S6, S6, 0x04 ++ PTR_ADDI S7, S7, 0x04 ++ PTR_ADDI S8, S8, 0x04 ++ PTR_ADDI P5, P5, 0x20 ++.L_N0: ++ blt ZERO, J, .L_M8 ++.L_M7: ++ andi J, M, 0x04 ++ beq ZERO, J, .L_M3 ++.L_M4: ++ move S1, S0 ++ PTR_ADD S2, S0, TL ++ PTR_ADD S3, S1, T0 ++ PTR_ADD S4, S2, T0 ++ PTR_ADD S0, S3, T0 ++ ++ move P1, P0 ++ PTR_ADDI P0, P0, 0x100 ++ ++ PTR_SRAI I, N, 0x04 ++ beq ZERO, I, .L_M4_N15 ++.align 5 ++.L_M4_N16: ++ xvld U0, S1, 0x00 ++ xvld U1, S1, 0x20 ++ xvld U2, S2, 0x00 ++ xvld U3, S2, 0x20 ++ ++ xvst U0, P1, 0x00 ++ xvst U1, P1, 0x20 ++ xvst U2, P1, 0x40 ++ xvst U3, P1, 0x60 ++ ++ xvld U4, S3, 0x00 ++ xvld U5, S3, 0x20 ++ xvld U6, S4, 0x00 ++ xvld U7, S4, 0x20 ++ ++ xvst U4, P1, 0x80 ++ xvst U5, P1, 0xA0 ++ xvst U6, P1, 0xC0 ++ xvst U7, P1, 0xE0 ++ ++ PTR_ADDI S1, S1, 0x40 ++ PTR_ADDI S2, S2, 0x40 ++ PTR_ADDI S3, S3, 0x40 ++ PTR_ADDI S4, S4, 0x40 ++ PTR_ADDI I, I, -1 ++ PTR_ADD P1, P1, T1 ++ blt ZERO, I, .L_M4_N16 ++.L_M4_N15: ++ andi I, N, 0x08 ++ beq ZERO, I, .L_M4_N7 ++.L_M4_N8: ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ ++ GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60 ++ ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI S2, S2, 0x20 ++ PTR_ADDI S3, S3, 0x20 ++ PTR_ADDI S4, S4, 0x20 ++ PTR_ADDI P2, P2, 0x80 ++.L_M4_N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_M4_N3 ++.L_M4_N4: ++ GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 ++ GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30 ++ PTR_ADDI S1, S1, 0x10 ++ PTR_ADDI S2, S2, 0x10 ++ PTR_ADDI S3, S3, 0x10 ++ PTR_ADDI S4, S4, 0x10 ++ PTR_ADDI P3, P3, 0x40 ++.L_M4_N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_M4_N1 ++.L_M4_N2: ++ GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 ++ GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI S2, S2, 0x08 ++ PTR_ADDI S3, S3, 0x08 ++ PTR_ADDI S4, S4, 0x08 ++ PTR_ADDI P4, P4, 0x20 ++.L_M4_N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_M3 ++ ++ GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 ++ GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C ++ PTR_ADDI S1, S1, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ PTR_ADDI S3, S3, 0x04 ++ PTR_ADDI S4, S4, 0x04 ++ PTR_ADDI P5, P5, 0x10 ++.L_M3: ++ andi J, M, 0x02 ++ beq ZERO, J, .L_M1 ++.L_M2: ++ move S1, S0 ++ PTR_ADD S2, S0, TL ++ PTR_ADD S0, S0, T0 ++ ++ move P1, P0 ++ PTR_ADDI P0, P0, 0x80 ++ ++ PTR_SRAI I, N, 0x04 ++ beq ZERO, I, .L_M2_N15 ++.align 5 ++.L_M2_N16: ++ xvld U0, S1, 0x00 ++ xvld U1, S1, 0x20 ++ xvld U2, S2, 0x00 ++ xvld U3, S2, 0x20 ++ ++ xvst U0, P1, 0x00 ++ xvst U1, P1, 0x20 ++ xvst U2, P1, 0x40 ++ xvst U3, P1, 0x60 ++ ++ PTR_ADDI S1, S1, 0x40 ++ PTR_ADDI S2, S2, 0x40 ++ PTR_ADDI I, I, -1 ++ PTR_ADD P1, P1, T1 ++ blt ZERO, I, .L_M2_N16 ++.L_M2_N15: ++ andi I, N, 0x08 ++ beq ZERO, I, .L_M2_N7 ++.L_M2_N8: ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ ++ GST xv, , U0, P2, 0x00, U1, P2, 0x20 ++ ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI S2, S2, 0x20 ++ PTR_ADDI P2, P2, 0x40 ++.L_M2_N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_M2_N3 ++.L_M2_N4: ++ GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 ++ GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10 ++ PTR_ADDI S1, S1, 0x10 ++ PTR_ADDI S2, S2, 0x10 ++ PTR_ADDI P3, P3, 0x20 ++.L_M2_N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_M2_N1 ++.L_M2_N2: ++ GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 ++ GST f, d, $f0, P4, 
0x00, $f1, P4, 0x08 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI S2, S2, 0x08 ++ PTR_ADDI P4, P4, 0x10 ++.L_M2_N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_M1 ++ ++ GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 ++ GST f, s, $f0, P5, 0x00, $f1, P5, 0x04 ++ PTR_ADDI S1, S1, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ PTR_ADDI P5, P5, 0x08 ++.L_M1: ++ andi J, M, 0x01 ++ beq ZERO, J, .L_M0 ++ ++ move S1, S0 ++ PTR_ADD S2, S0, TL ++ ++ move P1, P0 ++ PTR_ADDI P0, P0, 0x40 ++ ++ PTR_SRAI I, N, 0x04 ++ beq ZERO, I, .L_M1_N15 ++.align 5 ++.L_M1_N16: ++ xvld U0, S1, 0x00 ++ xvld U1, S1, 0x20 ++ ++ xvst U0, P1, 0x00 ++ xvst U1, P1, 0x20 ++ ++ PTR_ADDI S1, S1, 0x40 ++ PTR_ADDI I, I, -1 ++ PTR_ADD P1, P1, T1 ++ blt ZERO, I, .L_M1_N16 ++.L_M1_N15: ++ andi I, N, 0x08 ++ beq ZERO, I, .L_M1_N7 ++.L_M1_N8: ++ xvld U0, S1, 0x00 ++ ++ GST xv, , U0, P2, 0x00 ++ ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI P2, P2, 0x20 ++.L_M1_N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_M1_N3 ++.L_M1_N4: ++ GLD v, , $vr0, S1, 0x00 ++ GST v, , $vr0, P3, 0x00 ++ PTR_ADDI S1, S1, 0x10 ++ PTR_ADDI P3, P3, 0x10 ++.L_M1_N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_M1_N1 ++.L_M1_N2: ++ GLD f, d, $f0, S1, 0x00 ++ GST f, d, $f0, P4, 0x00 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI P4, P4, 0x08 ++.L_M1_N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_M0 ++ ++ GLD f, s, $f0, S1, 0x00 ++ GST f, s, $f0, P5, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ PTR_ADDI P5, P5, 0x04 ++.L_M0: ++ pop_if_used 24, 8 ++ jirl $r0, $r1, 0x00 ++ EPILOGUE +diff --git a/kernel/loongarch64/sgemm_tcopy_8_lasx.S b/kernel/loongarch64/sgemm_tcopy_8_lasx.S +new file mode 100644 +index 000000000..725a47a60 +--- /dev/null ++++ b/kernel/loongarch64/sgemm_tcopy_8_lasx.S +@@ -0,0 +1,406 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/08/23 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++*********************************************************************/ ++ ++/* Function parameters */ ++#define M $r4 // param 1: m ++#define N $r5 // param 2: n ++#define SRC $r6 // param 3: src ++#define LDA $r7 // param 4: lda ++#define DST $r8 // param 5: dst ++ ++#define I $r9 ++#define J $r10 ++#define S0 $r11 ++#define S1 $r12 ++#define S2 $r13 ++#define S3 $r14 ++#define S4 $r15 ++#define S5 $r16 ++#define S6 $r17 ++#define S7 $r18 ++#define S8 $r19 ++#define P0 $r20 ++#define P1 $r23 ++#define P2 $r24 ++#define P3 $r25 ++#define P4 $r26 ++#define T0 $r27 ++#define T1 $r28 ++#define TL $r7 ++#undef ZERO ++#define ZERO $r0 ++ ++/* LASX vectors */ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++ ++// Loops outline ++//.L_M8 <------------------- ++//| .L_N8: | ++//| .L_N7: | Main Loop ++//| .L_N4: | ++//| .L_N3: | ++//| .L_N2: | ++//| .L_N1: | ++//| .L_N0: --------------- ++//.L_M7 ++//.L_M4 ++//| .L_M4_N8: ++//| .L_M4_N7: ++//| .L_M4_N4: ++//| .L_M4_N3: ++//| .L_M4_N2: ++//| .L_M4_N1: ++//.L_M3 ++//.L_M2 ++//| .L_M2_N8: ++//| .L_M2_N7: ++//| .L_M2_N4: ++//| .L_M2_N3: ++//| .L_M2_N2: ++//| .L_M2_N1: ++//.L_M1 ++//| .L_M1_N8: ++//| .L_M1_N7: ++//| .L_M1_N4: ++//| .L_M1_N3: ++//| .L_M1_N2: ++//| .L_M1_N1: ++//.L_M0 ++ ++ PROLOGUE ++ push_if_used 23, 8 ++ ++ move S0, SRC ++ move P0, DST ++ ++ PTR_SRAI T0, N, 0x04 ++ PTR_SRAI T1, N, 0x03 ++ PTR_SLLI T0, T0, 0x04 ++ PTR_SLLI T1, T1, 0x03 ++ ++ PTR_MUL P2, M, T1 ++ PTR_SLLI P2, P2, 0x02 ++ PTR_ADD P2, DST, P2 ++ PTR_SRAI T0, N, 0x02 ++ PTR_SRAI T1, N, 0x01 ++ PTR_SLLI T0, T0, 0x02 ++ PTR_SLLI T1, T1, 0x01 ++ PTR_MUL P3, M, T0 ++ PTR_MUL P4, M, T1 ++ PTR_SLLI P3, P3, 0x02 ++ PTR_SLLI P4, P4, 0x02 ++ PTR_ADD P3, DST, P3 ++ PTR_ADD P4, DST, P4 ++ ++ PTR_SLLI TL, LDA, 0x02 ++ PTR_SRAI J, M, 0x03 ++ PTR_SLLI T0, TL, 0x01 ++ PTR_SLLI T1, M, 0x05 ++ beq ZERO, J, .L_M7 ++.align 5 ++.L_M8: ++ move S1, S0 ++ PTR_ADD S2, S0, TL ++ PTR_ADD S3, S1, T0 ++ PTR_ADD S4, S2, T0 ++ PTR_ADD S5, S3, T0 ++ PTR_ADD S6, S4, T0 ++ PTR_ADD S7, S5, T0 ++ PTR_ADD S8, S6, T0 ++ PTR_ADD S0, S7, T0 ++ ++ move P1, P0 ++ PTR_ADDI P0, P0, 0x100 ++ ++ PTR_SRAI I, N, 0x03 ++ PTR_ADDI J, J, -1 ++ beq ZERO, I, .L_N7 ++.L_N8: ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ xvld U4, S5, 0x00 ++ xvld U5, S6, 0x00 ++ xvld U6, S7, 0x00 ++ xvld U7, S8, 0x00 ++ ++ GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60, \ ++ U4, P1, 0x80, U5, P1, 0xA0, U6, P1, 0xC0, U7, P1, 0xE0 ++ ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI S2, S2, 0x20 ++ PTR_ADDI S3, S3, 0x20 ++ PTR_ADDI S4, S4, 0x20 ++ PTR_ADDI S5, S5, 0x20 ++ PTR_ADDI S6, S6, 0x20 ++ PTR_ADDI S7, S7, 0x20 ++ PTR_ADDI S8, S8, 0x20 ++ ++ PTR_ADDI I, I, -1 ++ PTR_ADD P1, P1, T1 ++ blt ZERO, I, .L_N8 ++.L_N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_N3 ++.L_N4: ++ GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ ++ $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 ++ GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30, \ ++ $vr4, P2, 0x40, $vr5, P2, 0x50, $vr6, P2, 0x60, $vr7, P2, 0x70 ++ PTR_ADDI S1, S1, 0x10 ++ PTR_ADDI S2, S2, 0x10 ++ 
PTR_ADDI S3, S3, 0x10 ++ PTR_ADDI S4, S4, 0x10 ++ PTR_ADDI S5, S5, 0x10 ++ PTR_ADDI S6, S6, 0x10 ++ PTR_ADDI S7, S7, 0x10 ++ PTR_ADDI S8, S8, 0x10 ++ PTR_ADDI P2, P2, 0x80 ++.L_N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_N1 ++.L_N2: ++ GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ ++ $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 ++ GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18, \ ++ $f4, P3, 0x20, $f5, P3, 0x28, $f6, P3, 0x30, $f7, P3, 0x38 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI S2, S2, 0x08 ++ PTR_ADDI S3, S3, 0x08 ++ PTR_ADDI S4, S4, 0x08 ++ PTR_ADDI S5, S5, 0x08 ++ PTR_ADDI S6, S6, 0x08 ++ PTR_ADDI S7, S7, 0x08 ++ PTR_ADDI S8, S8, 0x08 ++ PTR_ADDI P3, P3, 0x40 ++.L_N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_N0 ++ ++ GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ ++ $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 ++ GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C, \ ++ $f4, P4, 0x10, $f5, P4, 0x14, $f6, P4, 0x18, $f7, P4, 0x1C ++ PTR_ADDI S1, S1, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ PTR_ADDI S3, S3, 0x04 ++ PTR_ADDI S4, S4, 0x04 ++ PTR_ADDI S5, S5, 0x04 ++ PTR_ADDI S6, S6, 0x04 ++ PTR_ADDI S7, S7, 0x04 ++ PTR_ADDI S8, S8, 0x04 ++ PTR_ADDI P4, P4, 0x20 ++.L_N0: ++ blt ZERO, J, .L_M8 ++ ++.L_M7: ++ andi J, M, 0x04 ++ beq ZERO, J, .L_M3 ++.L_M4: ++ move S1, S0 ++ PTR_ADD S2, S0, TL ++ PTR_ADD S3, S1, T0 ++ PTR_ADD S4, S2, T0 ++ PTR_ADD S0, S3, T0 ++ ++ move P1, P0 ++ PTR_ADDI P0, P0, 0x80 ++ ++ PTR_SRAI I, N, 0x03 ++ beq ZERO, I, .L_M4_N7 ++.align 5 ++.L_M4_N8: ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ ++ GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60 ++ ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI S2, S2, 0x20 ++ PTR_ADDI S3, S3, 0x20 ++ PTR_ADDI S4, S4, 0x20 ++ ++ PTR_ADDI I, I, -1 ++ PTR_ADD P1, P1, T1 ++ blt ZERO, I, .L_M4_N8 ++.L_M4_N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_M4_N3 ++.L_M4_N4: ++ GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 ++ GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30 ++ PTR_ADDI S1, S1, 0x10 ++ PTR_ADDI S2, S2, 0x10 ++ PTR_ADDI S3, S3, 0x10 ++ PTR_ADDI S4, S4, 0x10 ++ PTR_ADDI P2, P2, 0x40 ++.L_M4_N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_M4_N1 ++.L_M4_N2: ++ GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 ++ GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI S2, S2, 0x08 ++ PTR_ADDI S3, S3, 0x08 ++ PTR_ADDI S4, S4, 0x08 ++ PTR_ADDI P3, P3, 0x20 ++.L_M4_N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_M3 ++ ++ GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 ++ GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C ++ PTR_ADDI S1, S1, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ PTR_ADDI S3, S3, 0x04 ++ PTR_ADDI S4, S4, 0x04 ++ PTR_ADDI P4, P4, 0x10 ++.L_M3: ++ andi J, M, 0x02 ++ beq ZERO, J, .L_M1 ++.L_M2: ++ move S1, S0 ++ PTR_ADD S2, S0, TL ++ PTR_ADD S0, S0, T0 ++ ++ move P1, P0 ++ PTR_ADDI P0, P0, 0x40 ++ ++ PTR_SRAI I, N, 0x03 ++ beq ZERO, I, .L_M2_N7 ++.align 5 ++.L_M2_N8: ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ ++ GST xv, , U0, P1, 0x00, U1, P1, 0x20 ++ ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI S2, S2, 0x20 ++ PTR_ADDI I, I, -1 ++ PTR_ADD P1, P1, T1 ++ blt ZERO, I, .L_M2_N8 ++.L_M2_N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_M2_N3 ++.L_M2_N4: ++ GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 ++ GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10 ++ PTR_ADDI S1, S1, 0x10 ++ 
PTR_ADDI S2, S2, 0x10 ++ PTR_ADDI P2, P2, 0x20 ++.L_M2_N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_M2_N1 ++.L_M2_N2: ++ GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 ++ GST f, d, $f0, P3, 0x00, $f1, P3, 0x08 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI S2, S2, 0x08 ++ PTR_ADDI P3, P3, 0x10 ++.L_M2_N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_M1 ++ ++ GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 ++ GST f, s, $f0, P4, 0x00, $f1, P4, 0x04 ++ PTR_ADDI S1, S1, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ PTR_ADDI P4, P4, 0x08 ++.L_M1: ++ andi J, M, 0x01 ++ beq ZERO, J, .L_M0 ++ ++ move S1, S0 ++ PTR_ADD S2, S0, TL ++ ++ move P1, P0 ++ PTR_ADDI P0, P0, 0x20 ++ ++ PTR_SRAI I, N, 0x03 ++ beq ZERO, I, .L_M1_N7 ++.align 5 ++.L_M1_N8: ++ xvld U0, S1, 0x00 ++ ++ GST xv, , U0, P1, 0x00 ++ ++ PTR_ADDI S1, S1, 0x20 ++ ++ PTR_ADDI I, I, -1 ++ PTR_ADD P1, P1, T1 ++ blt ZERO, I, .L_M1_N8 ++.L_M1_N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_M1_N3 ++.L_M1_N4: ++ GLD v, , $vr0, S1, 0x00 ++ GST v, , $vr0, P2, 0x00 ++ PTR_ADDI S1, S1, 0x10 ++ PTR_ADDI P2, P2, 0x10 ++.L_M1_N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_M1_N1 ++.L_M1_N2: ++ GLD f, d, $f0, S1, 0x00 ++ GST f, d, $f0, P3, 0x00 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI P3, P3, 0x08 ++.L_M1_N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_M0 ++ ++ GLD f, s, $f0, S1, 0x00 ++ GST f, s, $f0, P4, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ PTR_ADDI P4, P4, 0x04 ++.L_M0: ++ pop_if_used 23, 8 ++ jirl $r0, $r1, 0x00 ++ EPILOGUE +diff --git a/kernel/loongarch64/sgemv_n_8_lasx.S b/kernel/loongarch64/sgemv_n_8_lasx.S +new file mode 100644 +index 000000000..52ffc320e +--- /dev/null ++++ b/kernel/loongarch64/sgemv_n_8_lasx.S +@@ -0,0 +1,463 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/08/30 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++*********************************************************************/ ++ ++/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, ++ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) ++ */ ++#define M $r4 ++#define N $r5 ++#define ALPHA $f0 ++#define A $r7 ++#define LDA $r8 ++#define X $r9 ++#define INC_X $r10 ++#define Y $r11 ++#define INC_Y $r6 ++ ++#define J $r12 ++#define I $r13 ++#define K $r14 ++#define Y_ORG $r15 ++#define OFFSET $r16 ++#define K_LDA $r17 ++#define M4 $r18 ++#define T0 $r19 ++#define PA0 $r20 ++#define PA1 $r23 ++#define PA2 $r24 ++#define PA3 $r25 ++#define PA4 $r26 ++#define PA5 $r27 ++#define PA6 $r28 ++#define PA7 $r29 ++ ++#define VALPHA $xr1 ++#define X0 $xr2 ++#define X1 $xr3 ++#define X2 $xr4 ++#define X3 $xr5 ++#define X4 $xr6 ++#define X5 $xr7 ++#define X6 $xr8 ++#define X7 $xr9 ++#define Y0 $xr10 ++#define A0 $xr11 ++#define A1 $xr12 ++#define A2 $xr13 ++#define A3 $xr14 ++#define A4 $xr15 ++#define A5 $xr16 ++#define A6 $xr17 ++#define A7 $xr18 ++ ++#define X0_F $f2 ++#define X1_F $f3 ++#define X2_F $f4 ++#define X3_F $f5 ++#define X4_F $f6 ++#define X5_F $f7 ++#define X6_F $f8 ++#define X7_F $f9 ++#define Y0_F $f10 ++#define A0_F $f11 ++#define A1_F $f12 ++#define A2_F $f13 ++#define A3_F $f14 ++#define A4_F $f15 ++#define A5_F $f16 ++#define A6_F $f17 ++#define A7_F $f18 ++ ++.macro SLOAD_X_8 ++ GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04, X2, X, 0x08, X3, X, 0x0C, \ ++ X4, X, 0x10, X5, X, 0x14, X6, X, 0x18, X7, X, 0x1C ++ GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ ++ X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA ++.endm ++ ++.macro SLOAD_X_8_GAP ++ xvldrepl.w X0, X, 0x00 ++ PTR_ADD T0, X, INC_X ++ xvldrepl.w X1, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.w X2, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.w X3, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.w X4, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.w X5, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.w X6, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.w X7, T0, 0x00 ++ GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ ++ X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA ++.endm ++ ++.macro SLOAD_X_4 ++ GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04, X2, X, 0x08, X3, X, 0x0C ++ GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA ++.endm ++ ++.macro SLOAD_X_4_GAP ++ xvldrepl.w X0, X, 0x00 ++ PTR_ADD T0, X, INC_X ++ xvldrepl.w X1, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.w X2, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.w X3, T0, 0x00 ++ GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA ++.endm ++ ++.macro SLOAD_X_2 ++ GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04 ++ GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA ++.endm ++ ++.macro SLOAD_X_2_GAP ++ xvldrepl.w X0, X, 0x00 ++ PTR_ADD T0, X, INC_X ++ xvldrepl.w X1, T0, 0x00 ++ GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA ++.endm ++ ++.macro SLOAD_X_1 ++ GLDREPL xv, w, X0, X, 0x00 ++ GMUL xvf, s, X0, X0, VALPHA ++.endm ++ ++.macro SLOAD_Y_8 ++ GLD xv, , Y0, Y, 0 ++.endm ++ ++.macro SLOAD_Y_8_GAP ++ fld.s Y0_F, Y, 0 ++ fldx.s A0_F, Y, INC_Y ++ PTR_ALSL T0, 
INC_Y, Y, 1 ++ fld.s A1_F, T0, 0 ++ fldx.s A2_F, T0, INC_Y ++ PTR_ALSL T0, INC_Y, Y, 2 ++ fld.s A3_F, T0, 0 ++ fldx.s A4_F, T0, INC_Y ++ PTR_ADD T0, T0, INC_Y ++ PTR_ADD T0, T0, INC_Y ++ fld.s A5_F, T0, 0 ++ fldx.s A6_F, T0, INC_Y ++ GINSVE0 xv, w, Y0, A0, 1, Y0, A1, 2, Y0, A2, 3, Y0, A3, 4, \ ++ Y0, A4, 5, Y0, A5, 6, Y0, A6, 7 ++.endm ++ ++.macro SLOAD_Y_1 ++ GLD f, s, Y0_F, Y, 0 ++.endm ++ ++.macro SGEMV_N_8x8 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA1, 0, \ ++ A2, PA2, 0, A3, PA3, 0, \ ++ A4, PA4, 0, A5, PA5, 0, \ ++ A6, PA6, 0, A7, PA7, 0 ++ GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0, \ ++ Y0, A2, X2, Y0, Y0, A3, X3, Y0, \ ++ Y0, A4, X4, Y0, Y0, A5, X5, Y0, \ ++ Y0, A6, X6, Y0, Y0, A7, X7, Y0 ++.endm ++ ++.macro SGEMV_N_1x8 ++ GLD_INC f, s, 0x04, \ ++ A0_F, PA0, 0, A1_F, PA1, 0, \ ++ A2_F, PA2, 0, A3_F, PA3, 0, \ ++ A4_F, PA4, 0, A5_F, PA5, 0, \ ++ A6_F, PA6, 0, A7_F, PA7, 0 ++ GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F, \ ++ Y0_F, A2_F, X2_F, Y0_F, Y0_F, A3_F, X3_F, Y0_F, \ ++ Y0_F, A4_F, X4_F, Y0_F, Y0_F, A5_F, X5_F, Y0_F, \ ++ Y0_F, A6_F, X6_F, Y0_F, Y0_F, A7_F, X7_F, Y0_F ++.endm ++ ++.macro SGEMV_N_8x4 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA1, 0, \ ++ A2, PA2, 0, A3, PA3, 0 ++ GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0, \ ++ Y0, A2, X2, Y0, Y0, A3, X3, Y0 ++.endm ++ ++.macro SGEMV_N_1x4 ++ GLD_INC f, s, 0x04, \ ++ A0_F, PA0, 0, A1_F, PA1, 0, \ ++ A2_F, PA2, 0, A3_F, PA3, 0 ++ GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F, \ ++ Y0_F, A2_F, X2_F, Y0_F, Y0_F, A3_F, X3_F, Y0_F ++.endm ++ ++.macro SGEMV_N_8x2 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA1, 0 ++ GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0 ++.endm ++ ++.macro SGEMV_N_1x2 ++ GLD_INC f, s, 0x04, \ ++ A0_F, PA0, 0, A1_F, PA1, 0 ++ GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F ++.endm ++ ++.macro SGEMV_N_1x1 ++ GLD_INC f, s, 0x04, A0_F, PA0, 0 ++ GMADD f, s, Y0_F, A0_F, X0_F, Y0_F ++.endm ++ ++.macro SSTORE_Y_8 ++ GST xv, , Y0, Y, 0 ++.endm ++ ++.macro SSTORE_Y_8_GAP ++ xvstelm.w Y0, Y, 0, 0 ++ PTR_ADD T0, Y, INC_Y ++ xvstelm.w Y0, T0, 0, 1 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.w Y0, T0, 0, 2 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.w Y0, T0, 0, 3 ++ ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.w Y0, T0, 0, 4 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.w Y0, T0, 0, 5 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.w Y0, T0, 0, 6 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.w Y0, T0, 0, 7 ++.endm ++ ++.macro SSTORE_Y_1 ++ GST f, s, Y0_F, Y, 0 ++.endm ++ ++.macro SGEMV_N_LASX XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req ++ PTR_SRLI J, N, 3 ++ beqz J, .L_\XW\()_N_7 ++ PTR_SLLI K_LDA, LDA, 3 ++ PTR_SUB K_LDA, K_LDA, M4 ++.L_\XW\()_N_L8: ++ SLOAD_\X_8 ++ xor K, K, K ++ move Y, Y_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_M_7 ++.align 5 ++.L_\XW\()_M_L8: ++ SLOAD_\Y_8 ++ SGEMV_N_8x8 ++ SSTORE_\Y_8 ++ PTR_ADDI I, I, -1 ++ PTR_ALSL Y, INC_Y, Y, 3 ++ PTR_ADDI K, K, 8 ++ bnez I, .L_\XW\()_M_L8 ++.L_\XW\()_M_7: ++ andi I, M, 7 ++ beqz I, .L_\XW\()_M_END ++.align 5 ++.L_\XW\()_M_L1: ++ SLOAD_\Y_1 ++ SGEMV_N_1x8 ++ SSTORE_\Y_1 ++ PTR_ADDI I, I, -1 ++ PTR_ADD Y, Y, INC_Y ++ PTR_ADDI K, K, 1 ++ bnez I, .L_\XW\()_M_L1 ++.L_\XW\()_M_END: ++ PTR_ADDI J, J, -1 ++#if __loongarch_grlen == 64 ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, 
K_LDA, PA7, PA7, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#endif ++ PTR_ALSL X, INC_X, X, 3 ++ bnez J, .L_\XW\()_N_L8 ++.L_\XW\()_N_7: ++ andi J, N, 4 ++ beqz J, .L_\XW\()_N_3 ++ SLOAD_\X_4 ++ xor K, K, K ++ move Y, Y_ORG ++ ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_N_4_M_7 ++.align 5 ++.L_\XW\()_N_4_M_L8: ++ SLOAD_\Y_8 ++ SGEMV_N_8x4 ++ SSTORE_\Y_8 ++ PTR_ADDI I, I, -1 ++ PTR_ADDI K, K, 8 ++ PTR_ALSL Y, INC_Y, Y, 3 ++ bnez I, .L_\XW\()_N_4_M_L8 ++.L_\XW\()_N_4_M_7: ++ andi I, M, 7 ++ beqz I, .L_\XW\()_N_4_M_END ++.align 5 ++.L_\XW\()_N_4_M_L1: ++ SLOAD_\Y_1 ++ SGEMV_N_1x4 ++ SSTORE_\Y_1 ++ PTR_ADDI I, I, -1 ++ PTR_ADD Y, Y, INC_Y ++ PTR_ADDI K, K, 1 ++ bnez I, .L_\XW\()_N_4_M_L1 ++.L_\XW\()_N_4_M_END: ++ PTR_SLLI K_LDA, LDA, 2 ++ PTR_SUB K_LDA, K_LDA, M4 ++#if __loongarch_grlen == 64 ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#endif ++ PTR_ALSL X, INC_X, X, 2 ++.L_\XW\()_N_3: ++ andi J, N, 2 ++ beqz J, .L_\XW\()_N_1 ++ SLOAD_\X_2 ++ xor K, K, K ++ move Y, Y_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_N_2_M_7 ++.align 5 ++.L_\XW\()_N_2_M_L8: ++ SLOAD_\Y_8 ++ SGEMV_N_8x2 ++ SSTORE_\Y_8 ++ PTR_ADDI I, I, -1 ++ PTR_ADDI K, K, 8 ++ PTR_ALSL Y, INC_Y, Y, 3 ++ bnez I, .L_\XW\()_N_2_M_L8 ++.L_\XW\()_N_2_M_7: ++ andi I, M, 7 ++ beqz I, .L_\XW\()_N_2_M_END ++.align 5 ++.L_\XW\()_N_2_M_L1: ++ SLOAD_\Y_1 ++ SGEMV_N_1x2 ++ SSTORE_\Y_1 ++ PTR_ADDI I, I, -1 ++ PTR_ADD Y, Y, INC_Y ++ PTR_ADDI K, K, 1 ++ bnez I, .L_\XW\()_N_2_M_L1 ++.L_\XW\()_N_2_M_END: ++ PTR_SLLI K_LDA, LDA, 1 ++ PTR_SUB K_LDA, K_LDA, M4 ++ PTR_ADD PA0, PA0, K_LDA ++ PTR_ADD PA1, PA1, K_LDA ++ PTR_ALSL X, INC_X, X, 1 ++.L_\XW\()_N_1: ++ andi J, N, 1 ++ beqz J, .L_END ++ SLOAD_\X_1 ++ xor K, K, K ++ move Y, Y_ORG ++ move I, M ++ beqz I, .L_END ++.align 5 ++.L_\XW\()_N_1_M_L1: ++ SLOAD_\Y_1 ++ SGEMV_N_1x1 ++ SSTORE_\Y_1 ++ PTR_ADDI I, I, -1 ++ PTR_ADD Y, Y, INC_Y ++ PTR_ADDI K, K, 1 ++ bnez I, .L_\XW\()_N_1_M_L1 ++ b .L_END ++.endm ++ ++ PROLOGUE ++ PTR_LD INC_Y, $sp, 0 ++ push_if_used 17 + 7, 19 ++ PTR_ADDI K, $r0, 0x01 ++ PTR_SUB I, INC_X, K ++ PTR_SUB J, INC_Y, K ++ maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ ++ maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ ++ PTR_ALSL I, I, J, 1 ++ GSLLI , d, LDA, LDA, 2, INC_X, INC_X, 2, INC_Y, INC_Y, 2, M4, M, 2 ++ xvreplve0.w VALPHA, $xr0 ++ move Y_ORG, Y ++ move PA0, A ++#if __loongarch_grlen == 64 ++ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#else ++ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#endif ++ la.local T0, .L_GAP_TABLE ++ PTR_ALSL I, I, T0, 1 ++ ld.h K, I, 0 ++ PTR_ADD T0, T0, K ++ jirl $r0, T0, 0 ++.L_GAP_TABLE: ++ .hword .L_GAP_0_0 - .L_GAP_TABLE ++ .hword .L_GAP_0_1 - .L_GAP_TABLE ++ .hword .L_GAP_1_0 - .L_GAP_TABLE ++ .hword .L_GAP_1_1 - .L_GAP_TABLE ++.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ ++ SGEMV_N_LASX GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 ++.L_GAP_0_1: /* if (inc_x == 1) && 
(incy != 1) */ ++ SGEMV_N_LASX GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 ++.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ ++ SGEMV_N_LASX GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 ++.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ ++ SGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 ++.L_END: ++ pop_if_used 17 + 7, 19 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/sgemv_t_8_lasx.S b/kernel/loongarch64/sgemv_t_8_lasx.S +new file mode 100644 +index 000000000..f4bfffb42 +--- /dev/null ++++ b/kernel/loongarch64/sgemv_t_8_lasx.S +@@ -0,0 +1,405 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/08/30 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++*********************************************************************/ ++ ++/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, ++ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) ++ */ ++#define M $r4 ++#define N $r5 ++#define ALPHA $f0 ++#define A $r7 ++#define LDA $r8 ++#define X $r9 ++#define INC_X $r10 ++#define Y $r11 ++#define INC_Y $r6 ++ ++#define J $r12 ++#define I $r13 ++#define K $r14 ++#define PY0 $r14 ++#define X_ORG $r15 ++#define PY1 $r16 ++#define K_LDA $r17 ++#define PY2 $r18 ++#define T0 $r19 ++#define PA0 $r20 ++#define PA1 $r23 ++#define PA2 $r24 ++#define PA3 $r25 ++#define PA4 $r26 ++#define PA5 $r27 ++#define PA6 $r28 ++#define PA7 $r29 ++#define M4 $r30 ++ ++#define VALPHA $xr0 ++#define X0 $xr1 ++#define A0 $xr2 ++#define A1 $xr3 ++#define A2 $xr4 ++#define A3 $xr5 ++#define A4 $xr6 ++#define A5 $xr7 ++#define A6 $xr8 ++#define A7 $xr9 ++#define TP0 $xr10 ++#define TP1 $xr11 ++#define TP2 $xr12 ++#define TP3 $xr13 ++#define TP4 $xr14 ++#define TP5 $xr15 ++#define TP6 $xr16 ++#define TP7 $xr17 ++#define Y0 $xr2 ++#define Y1 $xr3 ++#define Y2 $xr4 ++#define Y3 $xr5 ++#define Y4 $xr6 ++#define Y5 $xr7 ++#define Y6 $xr8 ++#define Y7 $xr9 ++ ++.macro ZERO_Y8 ++ GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ ++ TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 ++.endm ++ ++.macro ZERO_Y4 ++ GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 ++.endm ++ ++.macro ZERO_Y2 ++ GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1 ++.endm ++ ++.macro ZERO_Y1 ++ GXOR xv, v, TP0, TP0, TP0 ++.endm ++ ++.macro SLOAD_X8 ++ GLD xv, , X0, X, 0x00 ++.endm ++ ++.macro SLOAD_X8_GAP ++ fld.s $f1, X, 0x00 ++ fldx.s $f2, X, INC_X ++ PTR_ALSL T0, INC_X, X, 1 ++ fld.s $f3, T0, 0x00 ++ fldx.s $f4, T0, INC_X ++ GINSVE0 xv, w, X0, A0, 1, X0, A1, 2, X0, A2, 3 ++ PTR_ALSL T0, INC_X, X, 2 ++ fld.s $f2, T0, 0x00 ++ fldx.s $f3, T0, INC_X ++ PTR_ALSL T0, INC_X, T0, 1 ++ fld.s $f4, T0, 0x00 ++ fldx.s $f5, T0, INC_X ++ GINSVE0 xv, w, X0, A0, 4, X0, A1, 5, X0, A2, 6, X0, A3, 7 ++.endm ++ ++.macro SGEMV_T_8x8 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA1, 0, \ ++ A2, PA2, 0, A3, PA3, 0, \ ++ A4, PA4, 0, A5, PA5, 0, \ ++ A6, PA6, 0, A7, PA7, 0 ++ GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1, \ ++ TP2, A2, X0, TP2, TP3, A3, X0, TP3, \ ++ TP4, A4, X0, TP4, TP5, A5, X0, TP5, \ ++ TP6, A6, X0, TP6, TP7, A7, X0, TP7 ++.endm ++ ++.macro SGEMV_T_4x8 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA1, 0, \ ++ A2, PA2, 0, A3, PA3, 0 ++ GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1, \ ++ TP2, A2, X0, TP2, TP3, A3, X0, TP3 ++.endm ++ ++.macro SGEMV_T_2x8 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA1, 0 ++ GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1 ++.endm ++ ++.macro SGEMV_T_LASX XW:req X8:req, X4:req ++ PTR_SRLI J, N, 3 ++ beqz J, .L_\XW\()_N_7 ++ PTR_SLLI K_LDA, LDA, 3 ++ PTR_SUB K_LDA, K_LDA, M4 ++.L_\XW\()_N_L8: ++ ZERO_Y8 ++ move X, X_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_M_7 ++.align 5 ++.L_\XW\()_M_L8: ++ SLOAD_\X8 ++ SGEMV_T_8x8 ++ PTR_ADDI I, I, -1 ++ PTR_ALSL X, INC_X, X, 3 ++ bnez I, .L_\XW\()_M_L8 ++.L_\XW\()_M_7: ++ // Accumulated ++ GACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, 
TP3, Y4, TP4, \ ++ Y5, TP5, Y6, TP6, Y7, TP7 ++ andi I, M, 7 ++ beqz I, .L_\XW\()_M_END ++.align 5 ++.L_\XW\()_M_L1: ++ fld.s $f1, X, 0x00 ++ fld.s $f10, PA0, 0x00 ++ fld.s $f11, PA1, 0x00 ++ fld.s $f12, PA2, 0x00 ++ fld.s $f13, PA3, 0x00 ++ fld.s $f14, PA4, 0x00 ++ fld.s $f15, PA5, 0x00 ++ fld.s $f16, PA6, 0x00 ++ fld.s $f17, PA7, 0x00 ++#if __loongarch_grlen == 64 ++ GADDI , d, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \ ++ PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04 ++#elif __loongarch_grlen == 32 ++ GADDI , w, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \ ++ PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04 ++#else ++ GADDI , d, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \ ++ PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04 ++#endif ++ GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, \ ++ $f6, $f14, $f1, $f6, $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, ++ PTR_ADDI I, I, -1 ++ PTR_ADD X, X, INC_X ++ bnez I, .L_\XW\()_M_L1 ++.L_\XW\()_M_END: ++ fld.s $f10, Y, 0x00 ++ fldx.s $f11, Y, INC_Y ++ PTR_ALSL PY0, INC_Y, Y, 1 ++ fld.s $f12, PY0, 0x00 ++ fldx.s $f13, PY0, INC_Y ++ PTR_ALSL PY1, INC_Y, Y, 2 ++ fld.s $f14, PY1, 0x00 ++ fldx.s $f15, PY1, INC_Y ++ PTR_ALSL PY2, INC_Y, PY1, 1 ++ fld.s $f16, PY2, 0x00 ++ fldx.s $f17, PY2, INC_Y ++ ++ GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, \ ++ $f14, ALPHA, $f6, $f14, $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17 ++ ++ PTR_ADDI J, J, -1 ++#if __loongarch_grlen == 64 ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#endif ++ fst.s $f10, Y, 0x00 ++ fstx.s $f11, Y, INC_Y ++ fst.s $f12, PY0, 0x00 ++ fstx.s $f13, PY0, INC_Y ++ fst.s $f14, PY1, 0x00 ++ fstx.s $f15, PY1, INC_Y ++ fst.s $f16, PY2, 0x00 ++ fstx.s $f17, PY2, INC_Y ++ ++ PTR_ALSL Y, INC_Y, Y, 3 ++ bnez J, .L_\XW\()_N_L8 ++.L_\XW\()_N_7: ++ andi J, N, 4 ++ beqz J, .L_\XW\()_N_3 ++ ZERO_Y4 ++ move X, X_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_N_4_M_7 ++.align 5 ++.L_\XW\()_N_4_M_L8: ++ SLOAD_\X8 ++ SGEMV_T_4x8 ++ PTR_ADDI I, I, -1 ++ PTR_ALSL X, INC_X, X, 3 ++ bnez I, .L_\XW\()_N_4_M_L8 ++.L_\XW\()_N_4_M_7: ++ // Accumulated ++ GACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 ++ andi I, M, 7 ++ beqz I, .L_\XW\()_N_4_M_END ++.align 5 ++.L_\XW\()_N_4_M_L1: ++ fld.s $f1, X, 0x00 ++ GLD_INC f, s, 0x04, $f10, PA0, 0x00, $f11, PA1, 0x00, $f12, PA2, 0x00, $f13, PA3, 0x00 ++ GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5 ++ PTR_ADDI I, I, -1 ++ PTR_ADD X, X, INC_X ++ bnez I, .L_\XW\()_N_4_M_L1 ++.L_\XW\()_N_4_M_END: ++ fld.s $f10, Y, 0x00 ++ fldx.s $f11, Y, INC_Y ++ PTR_ALSL PY0, INC_Y, Y, 1 ++ fld.s $f12, PY0, 0x00 ++ fldx.s $f13, PY0, INC_Y ++ ++ GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13 ++ ++ PTR_SLLI K_LDA, LDA, 2 ++ PTR_SUB K_LDA, K_LDA, M4 ++ ++#if __loongarch_grlen == 64 ++ GADD , 
d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#endif ++ fst.s $f10, Y, 0x00 ++ fstx.s $f11, Y, INC_Y ++ fst.s $f12, PY0, 0x00 ++ fstx.s $f13, PY0, INC_Y ++ PTR_ALSL Y, INC_Y, Y, 2 ++.L_\XW\()_N_3: ++ andi J, N, 2 ++ beqz J, .L_\XW\()_N_1 ++ ZERO_Y2 ++ move X, X_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_N_2_M_7 ++.align 5 ++.L_\XW\()_N_2_M_L8: ++ SLOAD_\X8 ++ SGEMV_T_2x8 ++ PTR_ADDI I, I, -1 ++ PTR_ALSL X, INC_X, X, 3 ++ bnez I, .L_\XW\()_N_2_M_L8 ++.L_\XW\()_N_2_M_7: ++ // Accumulated ++ GACC xvf, s, Y0, TP0, Y1, TP1 ++ andi I, M, 7 ++ beqz I, .L_\XW\()_N_2_M_END ++.align 5 ++.L_\XW\()_N_2_M_L1: ++ fld.s $f1, X, 0x00 ++ GLD_INC f, s, 0x04, $f10, PA0, 0x00, $f11, PA1, 0x00 ++ GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3 ++ PTR_ADDI I, I, -1 ++ PTR_ADD X, X, INC_X ++ bnez I, .L_\XW\()_N_2_M_L1 ++.L_\XW\()_N_2_M_END: ++ fld.s $f10, Y, 0x00 ++ fldx.s $f11, Y, INC_Y ++ ++ GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11 ++ ++ PTR_SLLI K_LDA, LDA, 1 ++ PTR_SUB K_LDA, K_LDA, M4 ++ ++#if __loongarch_grlen == 64 ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA ++#endif ++ fst.s $f10, Y, 0x00 ++ fstx.s $f11, Y, INC_Y ++ PTR_ALSL Y, INC_Y, Y, 1 ++.L_\XW\()_N_1: ++ andi J, N, 1 ++ beqz J, .L_END ++ ZERO_Y1 ++ move X, X_ORG ++ move I, M ++ beqz I, .L_END ++.align 5 ++.L_\XW\()_N_1_M_L1: ++ fld.s $f2, PA0, 0x00 ++ fld.s $f1, X, 0x00 ++ fmadd.s $f10, $f2, $f1, $f10 ++ PTR_ADDI I, I, -1 ++ PTR_ADD X, X, INC_X ++ PTR_ADDI PA0, PA0, 0x04 ++ bnez I, .L_\XW\()_N_1_M_L1 ++ ++ fld.s $f2, Y, 0x00 ++ fmadd.s $f2, ALPHA, $f10, $f2 ++ fst.s $f2, Y, 0x00 ++ b .L_END ++.endm ++ ++ PROLOGUE ++ PTR_LD INC_Y, $sp, 0 ++ push_if_used 17 + 8, 18 ++ PTR_ADDI K, $r0, 0x01 ++ PTR_SUB I, INC_X, K ++ maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ ++ GSLLI , d, LDA, LDA, 2, INC_X, INC_X, 2, INC_Y, INC_Y, 2, M4, M, 2 ++ xvreplve0.w VALPHA, $xr0 ++ move X_ORG, X ++ move PA0, A ++#if __loongarch_grlen == 64 ++ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#else ++ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#endif ++ la.local T0, .L_GAP_TABLE ++ PTR_ALSL I, I, T0, 1 ++ ld.h K, I, 0 ++ PTR_ADD T0, T0, K ++ jirl $r0, T0, 0 ++.L_GAP_TABLE: ++ .hword .L_GAP_0 - .L_GAP_TABLE ++ .hword .L_GAP_1 - .L_GAP_TABLE ++.L_GAP_0: /* if (incx == 1) */ ++ SGEMV_T_LASX GAP_0, X8, X4 ++.L_GAP_1: /* if (incx != 1) */ ++ SGEMV_T_LASX GAP_1, X8_GAP, X4_GAP ++.L_END: ++ pop_if_used 17 + 8, 18 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/lapack/laswp/loongarch64/Makefile b/lapack/laswp/loongarch64/Makefile +index 71e5a87cb..1c85667ec 100644 +--- a/lapack/laswp/loongarch64/Makefile ++++ b/lapack/laswp/loongarch64/Makefile +@@ -1,6 +1,11 @@ + TOPDIR = ../../.. 
+ include ../../../Makefile.system + ++ifeq ($(DYNAMIC_ARCH), 1) ++LASWP = ../generic/laswp_k_4.c ++ZLASWP = ../generic/zlaswp_k_4.c ++endif ++ + ifndef LASWP + LASWP = ../generic/laswp_k.c + endif +diff --git a/param.h b/param.h +index f1f5cbdad..a34e806c0 100644 +--- a/param.h ++++ b/param.h +@@ -2845,31 +2845,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #define GEMM_DEFAULT_OFFSET_B 0 + #define GEMM_DEFAULT_ALIGN 0x0ffffUL + ++#if defined(NO_LASX) ++#define DGEMM_DEFAULT_UNROLL_N 8 ++#define DGEMM_DEFAULT_UNROLL_M 2 + #define SGEMM_DEFAULT_UNROLL_N 8 ++#define SGEMM_DEFAULT_UNROLL_M 2 ++#else + #define DGEMM_DEFAULT_UNROLL_N 4 ++#define DGEMM_DEFAULT_UNROLL_M 16 ++#define SGEMM_DEFAULT_UNROLL_N 8 ++#define SGEMM_DEFAULT_UNROLL_M 16 ++#endif ++ + #define QGEMM_DEFAULT_UNROLL_N 2 + #define CGEMM_DEFAULT_UNROLL_N 4 + #define ZGEMM_DEFAULT_UNROLL_N 4 + #define XGEMM_DEFAULT_UNROLL_N 1 + +-#define SGEMM_DEFAULT_UNROLL_M 2 +-#define DGEMM_DEFAULT_UNROLL_M 16 + #define QGEMM_DEFAULT_UNROLL_M 2 + #define CGEMM_DEFAULT_UNROLL_M 1 + #define ZGEMM_DEFAULT_UNROLL_M 1 + #define XGEMM_DEFAULT_UNROLL_M 1 + +-#define SGEMM_DEFAULT_P 512 ++#define SGEMM_DEFAULT_P 256 + #define DGEMM_DEFAULT_P 32 + #define CGEMM_DEFAULT_P 128 + #define ZGEMM_DEFAULT_P 128 + +-#define SGEMM_DEFAULT_R 12288 ++#define SGEMM_DEFAULT_R 1024 + #define DGEMM_DEFAULT_R 858 + #define CGEMM_DEFAULT_R 4096 + #define ZGEMM_DEFAULT_R 4096 + +-#define SGEMM_DEFAULT_Q 128 ++#define SGEMM_DEFAULT_Q 256 + #define DGEMM_DEFAULT_Q 152 + #define CGEMM_DEFAULT_Q 128 + #define ZGEMM_DEFAULT_Q 128 +-- +2.20.1 + diff --git a/OpenBLAS-0.3.23.tar.gz b/OpenBLAS-0.3.23.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..31bec85b5297b69820ba189c120feebc730345dd Binary files /dev/null and b/OpenBLAS-0.3.23.tar.gz differ diff --git a/openblas.spec b/openblas.spec new file mode 100644 index 0000000000000000000000000000000000000000..b201ae57aea0da4cba63e287043f3cd98b0d76c0 --- /dev/null +++ b/openblas.spec @@ -0,0 +1,540 @@ +%define anolis_release 2 + +%bcond_with system_lapack +%global lapackver 3.9.1 + +Name: openblas +Summary: An optimized BLAS library based on GotoBLAS2 +Version: 0.3.23 +Release: %{anolis_release}%{?dist} +License: BSD +URL: https://github.com/xianyi/OpenBLAS/ +Source0: https://github.com/xianyi/OpenBLAS/releases/download/v%{version}/OpenBLAS-%{version}.tar.gz +Patch0001: 0001-openblas-0.2.15-system_lapack.patch +Patch0002: 0002-openblas-0.2.5-libname.patch +Patch0003: 0003-openblas-0.3.11-tests.patch +Patch0004: 0004-OpenBLAS-0.3.23-Add-opt-for-LoongArch64.patch + +BuildRequires: make gcc gcc-c++ gcc-gfortran +BuildRequires: perl-devel +Obsoletes: %{name}-Rblas < %{EVR} + +%global execstack 1 +%if %{execstack} +BuildRequires: execstack +%endif + +%if %{with system_lapack} +BuildRequires: lapack-static +%global lapacke 1 + +%else +%global lapacke 1 +Provides: bundled(lapack) = %{lapackver} +%endif + +%global build64 1 +%bcond_without cpp_thread_check + +%if %{with system_lapack} +%if %build64 +BuildRequires: lapack64-static +%endif +%endif + +%description +OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version. + +For a general introduction to the BLAS routines, please refer to the extensive documentation of +their reference implementation hosted at netlib: https://www.netlib.org/blas. 
On that site you will +likewise find documentation for the reference implementation of the higher-level library LAPACK - +the Linear Algebra Package that comes included with OpenBLAS. If you are looking for a general primer +or refresher on Linear Algebra, the set of six 20-minute lecture videos by Prof. Gilbert Strang on either +MIT OpenCourseWare https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/ or +Youtube https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek may be helpful. + +%package serial +Summary: An optimized BLAS library based on GotoBLAS2, serial version +Requires: %{name} = %{EVR} + +%description serial +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the sequential library compiled with a 32-bit +integer interface. + +%package openmp +Summary: An optimized BLAS library based on GotoBLAS2, OpenMP version +Requires: %{name} = %{EVR} + +%description openmp +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the library compiled with OpenMP support with +32-bit integer interface. + +%package threads +Summary: An optimized BLAS library based on GotoBLAS2, pthreads version +Requires: %{name} = %{EVR} + +%description threads +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the library compiled with threading support and +a 32-bit integer interface. + +%if %build64 +%package serial64 +Summary: An optimized BLAS library based on GotoBLAS2, serial version +Requires: %{name} = %{EVR} + +%description serial64 +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the sequential library compiled with a 64-bit +integer interface. + +%package openmp64 +Summary: An optimized BLAS library based on GotoBLAS2, OpenMP version +Requires: %{name} = %{EVR} + +%description openmp64 +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the library compiled with OpenMP support and +64-bit integer interface. + +%package threads64 +Summary: An optimized BLAS library based on GotoBLAS2, pthreads version +Requires: %{name} = %{EVR} + +%description threads64 +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the library compiled with threading support and +64-bit integer interface. + +%package serial64_ +Summary: An optimized BLAS library based on GotoBLAS2, serial version +Requires: %{name} = %{EVR} + +%description serial64_ +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. 
http://www.rdcps.ac.cn + +This package contains the sequential library compiled with a 64-bit +integer interface and a symbol name suffix. + +%package openmp64_ +Summary: An optimized BLAS library based on GotoBLAS2, OpenMP version +Requires: %{name} = %{EVR} + +%description openmp64_ +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the library compiled with OpenMP support and +64-bit integer interface and a symbol name suffix. + +%package threads64_ +Summary: An optimized BLAS library based on GotoBLAS2, pthreads version +Requires: %{name} = %{EVR} + +%description threads64_ +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the library compiled with threading support and +64-bit integer interface and a symbol name suffix. +%endif + +%package devel +Summary: Development headers and libraries for OpenBLAS +Requires: %{name} = %{EVR} +Requires: %{name}-serial = %{EVR} +Requires: %{name}-openmp = %{EVR} +Requires: %{name}-threads = %{EVR} +%if %build64 +Requires: %{name}-serial64 = %{EVR} +Requires: %{name}-serial64_ = %{EVR} +Requires: %{name}-openmp64 = %{EVR} +Requires: %{name}-threads64 = %{EVR} +Requires: %{name}-openmp64_ = %{EVR} +Requires: %{name}-threads64_ = %{EVR} +%endif +Requires: %{name}-srpm-macros + +%description devel +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the development headers and libraries. + +%package static +Summary: Static version of OpenBLAS +Requires: %{name}-devel = %{EVR} + +%description static +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the static libraries. + +%prep +%setup -q -c -T + +tar zxf %{SOURCE0} +cd OpenBLAS-%{version} +%if %{with system_lapack} +%patch0001 -p1 -b .system_lapack +%endif +%patch0002 -p1 -b .libname +%patch0003 -p1 -b .tests +%patch0004 -p1 -b .Add-opt-for-LoongArch64 + +find -name \*.f -exec chmod 644 {} \; + +%if %{with system_lapack} +rm -rf lapack-netlib +%endif + +cd .. +cp -ar OpenBLAS-%{version} openmp +cp -ar OpenBLAS-%{version} threaded + +%if %build64 +for d in {serial,threaded,openmp}64{,_}; do + cp -ar OpenBLAS-%{version} $d +done +%endif + +mv OpenBLAS-%{version} serial + +%if %{with system_lapack} +mkdir netliblapack +cd netliblapack +ar x %{_libdir}/liblapack_pic.a +for f in laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs; do + \rm {c,d,s,z}$f.o +done + +%if %{lapacke} +ar x %{_libdir}/liblapacke.a +%endif + +echo "TOPDIR = .." > Makefile +echo "include ../Makefile.system" >> Makefile +echo "COMMONOBJS = \\" >> Makefile +for i in *.o; do + echo "$i \\" >> Makefile +done +echo -e "\n\ninclude \$(TOPDIR)/Makefile.tail" >> Makefile + +%if %{lapacke} +# Copy include files +cp -a %{_includedir}/lapacke . +%endif +cd .. 
+ +for d in serial threaded openmp; do + cp -pr netliblapack $d +done +rm -rf netliblapack + +%if %build64 +mkdir netliblapack64 +cd netliblapack64 +ar x %{_libdir}/liblapack64_pic.a +for f in laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs; do + \rm {c,d,s,z}$f.o +done + +%if %{lapacke} +ar x %{_libdir}/liblapacke.a +%endif + +echo "TOPDIR = .." > Makefile +echo "include ../Makefile.system" >> Makefile +echo "COMMONOBJS = \\" >> Makefile +for i in *.o; do + echo "$i \\" >> Makefile +done +echo -e "\n\ninclude \$(TOPDIR)/Makefile.tail" >> Makefile + +%if %{lapacke} +# Copy include files +cp -a %{_includedir}/lapacke . +%endif + +cd .. + +for d in {serial,threaded,openmp}64{,_}; do + cp -pr netliblapack64 $d/netliblapack +done +rm -rf netliblapack64 +%endif +%endif + +%build +%define _lto_cflags %{nil} +%if !%{lapacke} +LAPACKE="NO_LAPACKE=1" +%endif + +NMAX="NUM_THREADS=128" + +%ifarch x86_64 +TARGET="TARGET=CORE2 DYNAMIC_ARCH=1 DYNAMIC_OLDER=1" +%endif +%ifarch aarch64 +TARGET="TARGET=ARMV8 DYNAMIC_ARCH=1 DYNAMIC_OLDER=1" +%endif +%ifarch loongarch64 +TARGET="TARGET=LOONGSONGENERIC DYNAMIC_ARCH=1" +%endif + +COMMON="%{optflags} -fPIC" +FCOMMON="%{optflags} -fPIC -frecursive" +export LDFLAGS="%{__global_ldflags}" + +COMMON="%{optflags} -fPIC" +FCOMMON="$COMMON -frecursive" +make -C serial $TARGET USE_THREAD=0 USE_LOCKING=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblas" $AVX $LAPACKE INTERFACE64=0 +make -C threaded $TARGET USE_THREAD=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblasp" $AVX $LAPACKE INTERFACE64=0 + +COMMON="%{optflags} -fPIC -fopenmp -pthread" +FCOMMON="$COMMON -frecursive" +make -C openmp $TARGET USE_THREAD=1 USE_OPENMP=1 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblaso" $AVX $LAPACKE INTERFACE64=0 %{with cpp_thread_check:CPP_THREAD_SAFETY_TEST=1} + +%if %build64 +COMMON="%{optflags} -fPIC" +FCOMMON="$COMMON -frecursive -fdefault-integer-8" +make -C serial64 $TARGET USE_THREAD=0 USE_LOCKING=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblas64" $AVX $LAPACKE INTERFACE64=1 +make -C threaded64 $TARGET USE_THREAD=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblasp64" $AVX $LAPACKE INTERFACE64=1 + +COMMON="%{optflags} -fPIC -fopenmp -pthread" +FCOMMON="$COMMON -frecursive -fdefault-integer-8" +make -C openmp64 $TARGET USE_THREAD=1 USE_OPENMP=1 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblaso64" $AVX $LAPACKE INTERFACE64=1 CPP_THREAD_SAFETY_TEST=1 + +COMMON="%{optflags} -fPIC" +FCOMMON="$COMMON -frecursive -fdefault-integer-8" +make -C serial64_ $TARGET USE_THREAD=0 USE_LOCKING=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblas64_" $AVX $LAPACKE INTERFACE64=1 SYMBOLSUFFIX=64_ +make -C threaded64_ $TARGET USE_THREAD=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblasp64_" $AVX $LAPACKE INTERFACE64=1 SYMBOLSUFFIX=64_ + +COMMON="%{optflags} -fPIC -fopenmp -pthread" +FCOMMON="$COMMON -frecursive -fdefault-integer-8" +make -C openmp64_ $TARGET USE_THREAD=1 USE_OPENMP=1 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblaso64_" $AVX $LAPACKE INTERFACE64=1 SYMBOLSUFFIX=64_ CPP_THREAD_SAFETY_TEST=1 +%endif + +%install 
+make -C serial USE_THREAD=0 PREFIX=%{buildroot} OPENBLAS_LIBRARY_DIR=%{buildroot}%{_libdir} OPENBLAS_INCLUDE_DIR=%{buildroot}%{_includedir}/%name OPENBLAS_BINARY_DIR=%{buildroot}%{_bindir} OPENBLAS_CMAKE_DIR=%{buildroot}%{_libdir}/cmake install + +%if %{with system_lapack} && %{lapacke} +cp -a %{_includedir}/lapacke %{buildroot}%{_includedir}/%{name} +%endif + +suffix="" +slibname=`basename %{buildroot}%{_libdir}/libopenblas${suffix}-*.so .so` +mv %{buildroot}%{_libdir}/${slibname}.a %{buildroot}%{_libdir}/lib%{name}.a +if [[ "$suffix" != "" ]]; then + sname=$(echo $slibname | sed "s|$suffix||g") + mv %{buildroot}%{_libdir}/${slibname}.so %{buildroot}%{_libdir}/${sname}.so +else + sname=${slibname} +fi + +olibname=`echo ${slibname} | sed "s|lib%{name}|lib%{name}o|g"` +install -D -pm 644 openmp/${olibname}.a %{buildroot}%{_libdir}/lib%{name}o.a +if [[ "$suffix" != "" ]]; then + oname=$(echo $olibname | sed "s|$suffix||g") +else + oname=${olibname} +fi +install -D -pm 755 openmp/${olibname}.so %{buildroot}%{_libdir}/${oname}.so + +plibname=`echo ${slibname} | sed "s|lib%{name}|lib%{name}p|g"` +install -D -pm 644 threaded/${plibname}.a %{buildroot}%{_libdir}/lib%{name}p.a +if [[ "$suffix" != "" ]]; then + pname=$(echo $plibname | sed "s|$suffix||g") +else + pname=${plibname} +fi +install -D -pm 755 threaded/${plibname}.so %{buildroot}%{_libdir}/${pname}.so + +%if %build64 +slibname64=`echo ${slibname} | sed "s|lib%{name}|lib%{name}64|g"` +install -D -pm 644 serial64/${slibname64}.a %{buildroot}%{_libdir}/lib%{name}64.a +slibname64_=`echo ${slibname} | sed "s|lib%{name}|lib%{name}64_|g"` +install -D -pm 644 serial64_/${slibname64_}.a %{buildroot}%{_libdir}/lib%{name}64_.a + +if [[ "$suffix" != "" ]]; then + sname64=$(echo ${slibname64} | sed "s|$suffix||g") + sname64_=$(echo ${slibname64_} | sed "s|$suffix||g") +else + sname64=${slibname64} + sname64_=${slibname64_} +fi +install -D -pm 755 serial64/${slibname64}.so %{buildroot}%{_libdir}/${sname64}.so +install -D -pm 755 serial64_/${slibname64_}.so %{buildroot}%{_libdir}/${sname64_}.so + +olibname64=`echo ${slibname} | sed "s|lib%{name}|lib%{name}o64|g"` +install -D -pm 644 openmp64/${olibname64}.a %{buildroot}%{_libdir}/lib%{name}o64.a +olibname64_=`echo ${slibname} | sed "s|lib%{name}|lib%{name}o64_|g"` +install -D -pm 644 openmp64_/${olibname64_}.a %{buildroot}%{_libdir}/lib%{name}o64_.a + +if [[ "$suffix" != "" ]]; then + oname64=$(echo ${olibname64} | sed "s|$suffix||g") + oname64_=$(echo ${olibname64_} | sed "s|$suffix||g") +else + oname64=${olibname64} + oname64_=${olibname64_} +fi +install -D -pm 755 openmp64/${olibname64}.so %{buildroot}%{_libdir}/${oname64}.so +install -D -pm 755 openmp64_/${olibname64_}.so %{buildroot}%{_libdir}/${oname64_}.so + +plibname64=`echo ${slibname} | sed "s|lib%{name}|lib%{name}p64|g"` +install -D -pm 644 threaded64/${plibname64}.a %{buildroot}%{_libdir}/lib%{name}p64.a +plibname64_=`echo ${slibname} | sed "s|lib%{name}|lib%{name}p64_|g"` +install -D -pm 644 threaded64_/${plibname64_}.a %{buildroot}%{_libdir}/lib%{name}p64_.a + +if [[ "$suffix" != "" ]]; then + pname64=$(echo $plibname64 | sed "s|$suffix||g") + pname64_=$(echo $plibname64_ | sed "s|$suffix||g") +else + pname64=${plibname64} + pname64_=${plibname64_} +fi +install -D -pm 755 threaded64/${plibname64}.so %{buildroot}%{_libdir}/${pname64}.so +install -D -pm 755 threaded64_/${plibname64_}.so %{buildroot}%{_libdir}/${pname64_}.so +%endif + +pushd %{buildroot}%{_libdir} +ln -sf ${sname}.so lib%{name}.so.0 +ln -sf ${sname}.so lib%{name}.so +ln -sf 
${oname}.so lib%{name}o.so.0 +ln -sf ${oname}.so lib%{name}o.so +ln -sf ${pname}.so lib%{name}p.so.0 +ln -sf ${pname}.so lib%{name}p.so + +%if %build64 +ln -sf ${sname64}.so lib%{name}64.so.0 +ln -sf ${sname64}.so lib%{name}64.so +ln -sf ${sname64_}.so lib%{name}64_.so.0 +ln -sf ${sname64_}.so lib%{name}64_.so +ln -sf ${oname64}.so lib%{name}o64.so.0 +ln -sf ${oname64}.so lib%{name}o64.so +ln -sf ${oname64_}.so lib%{name}o64_.so.0 +ln -sf ${oname64_}.so lib%{name}o64_.so +ln -sf ${pname64}.so lib%{name}p64.so.0 +ln -sf ${pname64}.so lib%{name}p64.so +ln -sf ${pname64_}.so lib%{name}p64_.so.0 +ln -sf ${pname64_}.so lib%{name}p64_.so +%endif + +%if %{execstack} +for lib in %{buildroot}%{_libdir}/libopenblas*.so; do + execstack -c $lib +done +%endif + +rm -rf %{buildroot}%{_libdir}/cmake +rm -rf %{buildroot}%{_libdir}/pkgconfig + +%files +%license serial/LICENSE +%doc serial/Changelog.txt serial/GotoBLAS* + +%files serial +%{_libdir}/lib%{name}.so.* +%{_libdir}/lib%{name}-*.so + +%files openmp +%{_libdir}/lib%{name}o.so.* +%{_libdir}/lib%{name}o-*.so + +%files threads +%{_libdir}/lib%{name}p.so.* +%{_libdir}/lib%{name}p-*.so + +%if %build64 +%files serial64 +%{_libdir}/lib%{name}64.so.* +%{_libdir}/lib%{name}64-*.so + +%files openmp64 +%{_libdir}/lib%{name}o64.so.* +%{_libdir}/lib%{name}o64-*.so + +%files threads64 +%{_libdir}/lib%{name}p64.so.* +%{_libdir}/lib%{name}p64-*.so + +%files serial64_ +%{_libdir}/lib%{name}64_.so.* +%{_libdir}/lib%{name}64_-*.so + +%files openmp64_ +%{_libdir}/lib%{name}o64_.so.* +%{_libdir}/lib%{name}o64_-*.so + +%files threads64_ +%{_libdir}/lib%{name}p64_.so.* +%{_libdir}/lib%{name}p64_-*.so +%endif + +%files devel +%{_includedir}/%{name}/ +%{_libdir}/lib%{name}.so +%{_libdir}/lib%{name}p.so +%{_libdir}/lib%{name}o.so +%if %build64 +%{_libdir}/lib%{name}p64_.so +%{_libdir}/lib%{name}o64.so +%{_libdir}/lib%{name}p64.so +%{_libdir}/lib%{name}64_.so +%{_libdir}/lib%{name}o64_.so +%{_libdir}/lib%{name}64.so +%endif + +%files static +%{_libdir}/lib%{name}.a +%{_libdir}/lib%{name}o.a +%{_libdir}/lib%{name}p.a +%if %build64 +%{_libdir}/lib%{name}o64.a +%{_libdir}/lib%{name}p64.a +%{_libdir}/lib%{name}o64_.a +%{_libdir}/lib%{name}p64_.a +%{_libdir}/lib%{name}64.a +%{_libdir}/lib%{name}64_.a +%endif + +%changelog +* Tue Oct 31 2023 XiWei Gu - 0.3.23-2 +- Add opt for LoongArch64 + +* Sun Apr 16 2023 Funda Wang - 0.3.23-1 +- New version 0.3.23 + +* Sat Apr 15 2023 Heng Qi - 0.3.21-2 +- Refactor the spec file + +* Fri Jan 27 2023 Funda Wang - 0.3.21-1 +- Import package for anolis 23