diff --git a/0001-openblas-0.2.15-system_lapack.patch b/0001-openblas-0.2.15-system_lapack.patch new file mode 100644 index 0000000000000000000000000000000000000000..4b843a9a4c853a9255c78eb38dc4713f89edd705 --- /dev/null +++ b/0001-openblas-0.2.15-system_lapack.patch @@ -0,0 +1,87 @@ +diff -up OpenBLAS-0.2.15/Makefile.system_lapack OpenBLAS-0.2.15/Makefile +--- OpenBLAS-0.2.15/Makefile.system_lapack 2015-10-27 13:44:50.000000000 -0700 ++++ OpenBLAS-0.2.15/Makefile 2015-10-28 09:14:56.696685503 -0700 +@@ -16,11 +16,7 @@ BLASDIRS += reference + endif + + SUBDIRS = $(BLASDIRS) +-ifneq ($(NO_LAPACK), 1) +-SUBDIRS += lapack +-endif +- +-LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) ++SUBDIRS += lapack + + SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench + +@@ -211,57 +207,8 @@ hpl_p : + fi; \ + done + +-ifeq ($(NO_LAPACK), 1) + netlib : +- +-else +-netlib : lapack_prebuild +-ifndef NOFORTRAN +- @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib +- @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib +-endif +-ifndef NO_LAPACKE +- @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib +-endif +-endif +- +-prof_lapack : lapack_prebuild +- @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof +- +-lapack_prebuild : +-ifndef NOFORTRAN +- -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc +-ifeq ($(FC), gfortran) +- -@echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc +-ifdef SMP +- -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc +-else +- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc +-endif +-else +- -@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc +- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc +-endif +- -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc +-endif ++ @$(MAKE) -C $(NETLIB_LAPACK_DIR) + + large.tgz : + ifndef NOFORTRAN +diff -up OpenBLAS-0.2.15/Makefile.system.system_lapack OpenBLAS-0.2.15/Makefile.system +--- OpenBLAS-0.2.15/Makefile.system.system_lapack 2015-10-27 13:44:50.000000000 -0700 ++++ OpenBLAS-0.2.15/Makefile.system 2015-10-28 09:14:39.994350500 -0700 +@@ -9,7 +9,7 @@ ifndef TOPDIR + TOPDIR = . 
+ endif + +-NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib ++NETLIB_LAPACK_DIR = $(TOPDIR)/netliblapack + + # Default C compiler + # - Only set if not specified on the command line or inherited from the environment. diff --git a/0002-openblas-0.2.5-libname.patch b/0002-openblas-0.2.5-libname.patch new file mode 100644 index 0000000000000000000000000000000000000000..e30ab8bf91ad0dfb05572b11cb21680483cce0d4 --- /dev/null +++ b/0002-openblas-0.2.5-libname.patch @@ -0,0 +1,24 @@ +diff -up OpenBLAS-0.2.5/Makefile.system.orig OpenBLAS-0.2.5/Makefile.system +--- OpenBLAS-0.2.5/Makefile.system.orig 2012-11-27 01:24:53.000000000 +0200 ++++ OpenBLAS-0.2.5/Makefile.system 2012-12-24 16:13:57.316689688 +0200 +@@ -758,16 +758,16 @@ ifndef SMP + LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX) + LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX) + else +-LIBNAME = $(LIBPREFIX)_$(LIBCORE)p$(REVISION).$(LIBSUFFIX) +-LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)p$(REVISION)_p.$(LIBSUFFIX) ++LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX) ++LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX) + endif + else + ifndef SMP + LIBNAME = $(LIBPREFIX)$(REVISION).$(LIBSUFFIX) + LIBNAME_P = $(LIBPREFIX)$(REVISION)_p.$(LIBSUFFIX) + else +-LIBNAME = $(LIBPREFIX)p$(REVISION).$(LIBSUFFIX) +-LIBNAME_P = $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX) ++LIBNAME = $(LIBPREFIX)$(REVISION).$(LIBSUFFIX) ++LIBNAME_P = $(LIBPREFIX)$(REVISION)_p.$(LIBSUFFIX) + endif + endif + diff --git a/0003-openblas-0.3.11-tests.patch b/0003-openblas-0.3.11-tests.patch new file mode 100644 index 0000000000000000000000000000000000000000..abbdf45f4aeaf952f7fb1707a8104a4892023963 --- /dev/null +++ b/0003-openblas-0.3.11-tests.patch @@ -0,0 +1,26 @@ +diff -up OpenBLAS-0.3.21/Makefile.fixtests OpenBLAS-0.3.21/Makefile +--- OpenBLAS-0.3.21/Makefile.fixtests 2022-08-26 07:37:06.257272957 +0200 ++++ OpenBLAS-0.3.21/Makefile 2022-08-26 07:37:53.168414307 +0200 +@@ -147,18 +147,18 @@ tests : + ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) + touch $(LIBNAME) + ifndef NO_FBLAS +- $(MAKE) -C test all ++ $(MAKE) -C test FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all + endif + endif + ifneq ($(ONLY_CBLAS), 1) +- $(MAKE) -C utest all ++ $(MAKE) -C utest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all + endif + ifneq ($(NO_CBLAS), 1) + ifneq ($(ONLY_CBLAS), 1) +- $(MAKE) -C ctest all ++ $(MAKE) -C ctest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all + endif + ifeq ($(CPP_THREAD_SAFETY_TEST), 1) +- $(MAKE) -C cpp_thread_test all ++ $(MAKE) -C cpp_thread_test FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all + endif + endif + diff --git a/0004-OpenBLAS-0.3.23-Add-opt-for-LoongArch64.patch b/0004-OpenBLAS-0.3.23-Add-opt-for-LoongArch64.patch new file mode 100644 index 0000000000000000000000000000000000000000..a13979238ba58fe187acae043f454045975ecf54 --- /dev/null +++ b/0004-OpenBLAS-0.3.23-Add-opt-for-LoongArch64.patch @@ -0,0 +1,18231 @@ +From 642128b0e5f86a5bbb304350ff4826028ccd2e20 Mon Sep 17 00:00:00 2001 +From: gxw +Date: Fri, 11 Aug 2023 10:11:51 +0800 +Subject: [PATCH] OpenBLAS-0.3.23: Add opt for LoongArch64 + +--- + .github/workflows/loongarch64.yml | 110 + + Makefile.system | 10 +- + c_check | 35 + + c_check.pl | 45 + + common_loongarch64.h | 13 + + cpuid_loongarch64.c | 18 +- + kernel/loongarch64/KERNEL.LOONGSON3R5 | 31 +- + kernel/loongarch64/KERNEL.generic | 4 + + 
kernel/loongarch64/dgemm_kernel_16x4.S | 4058 +++++++---------- + kernel/loongarch64/dgemv_n_8_lasx.S | 554 +++ + kernel/loongarch64/dgemv_t_8_lasx.S | 481 ++ + .../loongarch64/dtrsm_kernel_LN_16x4_lasx.S | 1366 ++++++ + .../loongarch64/dtrsm_kernel_LT_16x4_lasx.S | 959 ++++ + .../loongarch64/dtrsm_kernel_RN_16x4_lasx.S | 882 ++++ + .../loongarch64/dtrsm_kernel_RT_16x4_lasx.S | 953 ++++ + kernel/loongarch64/dtrsm_kernel_macro.S | 2147 +++++++++ + kernel/loongarch64/loongarch64_asm.S | 430 ++ + kernel/loongarch64/sgemm_kernel_16x8_lasx.S | 2348 ++++++++++ + kernel/loongarch64/sgemm_ncopy_16_lasx.S | 463 ++ + kernel/loongarch64/sgemm_ncopy_8_lasx.S | 298 ++ + kernel/loongarch64/sgemm_tcopy_16_lasx.S | 526 +++ + kernel/loongarch64/sgemm_tcopy_8_lasx.S | 406 ++ + kernel/loongarch64/sgemv_n_8_lasx.S | 463 ++ + kernel/loongarch64/sgemv_t_8_lasx.S | 405 ++ + lapack/laswp/loongarch64/Makefile | 5 + + param.h | 18 +- + 26 files changed, 14611 insertions(+), 2417 deletions(-) + create mode 100644 .github/workflows/loongarch64.yml + create mode 100644 kernel/loongarch64/dgemv_n_8_lasx.S + create mode 100644 kernel/loongarch64/dgemv_t_8_lasx.S + create mode 100644 kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S + create mode 100644 kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S + create mode 100644 kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S + create mode 100644 kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S + create mode 100644 kernel/loongarch64/dtrsm_kernel_macro.S + create mode 100644 kernel/loongarch64/loongarch64_asm.S + create mode 100644 kernel/loongarch64/sgemm_kernel_16x8_lasx.S + create mode 100644 kernel/loongarch64/sgemm_ncopy_16_lasx.S + create mode 100644 kernel/loongarch64/sgemm_ncopy_8_lasx.S + create mode 100644 kernel/loongarch64/sgemm_tcopy_16_lasx.S + create mode 100644 kernel/loongarch64/sgemm_tcopy_8_lasx.S + create mode 100644 kernel/loongarch64/sgemv_n_8_lasx.S + create mode 100644 kernel/loongarch64/sgemv_t_8_lasx.S + +diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml +new file mode 100644 +index 000000000..5501e98e0 +--- /dev/null ++++ b/.github/workflows/loongarch64.yml +@@ -0,0 +1,110 @@ ++name: loongarch64 qemu test ++ ++on: [push, pull_request] ++ ++jobs: ++ TEST: ++ runs-on: ubuntu-latest ++ strategy: ++ fail-fast: false ++ matrix: ++ include: ++ - target: LOONGSONGENERIC ++ triple: loongarch64-unknown-linux-gnu ++ opts: NO_SHARED=1 TARGET=LOONGSONGENERIC ++ - target: LOONGSON3R5 ++ triple: loongarch64-unknown-linux-gnu ++ opts: NO_SHARED=1 TARGET=LOONGSON3R5 ++ - target: LOONGSON2K1000 ++ triple: loongarch64-unknown-linux-gnu ++ opts: NO_SHARED=1 TARGET=LOONGSON2K1000 ++ ++ steps: ++ - name: Checkout repository ++ uses: actions/checkout@v3 ++ ++ - name: Install APT deps ++ run: | ++ sudo add-apt-repository ppa:savoury1/virtualisation ++ sudo apt-get update ++ sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ ++ qemu-user-static ++ ++ - name: Download and install loongarch64-toolchain ++ run: | ++ wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz ++ tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt ++ ++ - name: Set env ++ run: | ++ echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV ++ echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV ++ ++ - name: Compilation cache ++ uses: actions/cache@v3 ++ with: ++ 
path: ~/.ccache ++ key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} ++ restore-keys: | ++ ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} ++ ccache-${{ runner.os }}-${{ matrix.target }} ++ ++ - name: Configure ccache ++ run: | ++ test -d ~/.ccache || mkdir -p ~/.ccache ++ echo "max_size = 300M" > ~/.ccache/ccache.conf ++ echo "compression = true" >> ~/.ccache/ccache.conf ++ ccache -s ++ ++ - name: Disable utest dsdot:dsdot_n_1 ++ run: | ++ echo -n > utest/test_dsdot.c ++ echo "Due to the qemu versions 7.2 causing utest cases to fail," ++ echo "the utest dsdot:dsdot_n_1 have been temporarily disabled." ++ ++ - name: Build OpenBLAS ++ run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) ++ ++ - name: Test ++ run: | ++ qemu-loongarch64-static ./utest/openblas_utest ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3 ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1 ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1 ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1 ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1 ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1 ++ rm -f ./test/?BLAT2.SUMM ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat ++ rm -f ./test/?BLAT2.SUMM ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat ++ rm -f ./test/?BLAT3.SUMM ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < 
./test/cblat3.dat ++ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat ++ rm -f ./test/?BLAT3.SUMM ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat ++ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat +diff --git a/Makefile.system b/Makefile.system +index 343b94bb3..1eabff27d 100644 +--- a/Makefile.system ++++ b/Makefile.system +@@ -932,8 +932,12 @@ BINARY_DEFINED = 1 + endif + + ifeq ($(ARCH), loongarch64) +-CCOMMON_OPT += -march=loongarch64 -mabi=lp64 +-FCOMMON_OPT += -march=loongarch64 -mabi=lp64 ++LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d) ++ifneq ($(LA64_ABI), lp64d) ++LA64_ABI=lp64 ++endif ++CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) ++FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) + endif + + endif +@@ -1763,6 +1767,8 @@ export TARGET_CORE + export NO_AVX512 + export NO_AVX2 + export BUILD_BFLOAT16 ++export NO_LSX ++export NO_LASX + + export SBGEMM_UNROLL_M + export SBGEMM_UNROLL_N +diff --git a/c_check b/c_check +index e8f90e18a..5a7163a63 100755 +--- a/c_check ++++ b/c_check +@@ -181,6 +181,37 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then + rm -rf "$tmpd" + fi + ++no_lsx=0 ++no_lasx=0 ++if [ "$architecture" = "loongarch64" ]; then ++ tmpd="$(mktemp -d)" ++ tmplsx="$tmpd/lsx.c" ++ codelsx='"vadd.b $vr0, $vr0, $vr0"' ++ lsx_flags='-march=loongarch64 -mlsx' ++ printf "#include \n\n" >> "$tmplsx" ++ printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx" ++ args="$lsx_flags -o $tmplsx.o $tmplsx" ++ { ++ $compiler_name $flags $args >/dev/null 2>&1 ++ } || { ++ no_lsx=1 ++ } ++ ++ tmplasx="$tmpd/lasx.c" ++ codelasx='"xvadd.b $xr0, $xr0, $xr0"' ++ lasx_flags='-march=loongarch64 -mlasx' ++ printf "#include \n\n" >> "$tmplasx" ++ printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx" ++ args="$lasx_flags -o $tmplasx.o $tmplasx" ++ { ++ $compiler_name $flags $args >/dev/null 2>&1 ++ } || { ++ no_lasx=1 ++ } ++ ++ rm -rf "$tmpd" ++fi ++ + case "$data" in + *ARCH_X86_64*) architecture=x86_64 ;; + *ARCH_X86*) architecture=x86 ;; +@@ -395,6 +426,8 @@ done + [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" + [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" + [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" ++ [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n" ++ [ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n" + } >> "$makefile" + + os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ ` +@@ -410,6 +443,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' ` + [ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu" + [ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n" + [ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n" ++ [ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n" ++ [ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n" + } >> "$config" + + +diff --git a/c_check.pl b/c_check.pl +index 6ce28e11b..7a860a211 100644 +--- a/c_check.pl ++++ b/c_check.pl +@@ -232,6 +232,47 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { + } + } + ++$no_lsx = 0; ++$no_lasx = 0; ++if (($architecture eq "loongarch64")) { ++ eval "use File::Temp qw(tempfile)"; ++ if ($@){ ++ warn "could not load PERL module File::Temp, so could not check LSX and LASX 
capatibility"; ++ } else { ++ $tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); ++ $codelsx = '"vadd.b $vr0, $vr0, $vr0"'; ++ $lsx_flags = "-march=loongarch64 -mlsx"; ++ print $tmplsx "#include \n\n"; ++ print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n"; ++ ++ $args = "$lsx_flags -o $tmplsx.o $tmplsx"; ++ my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); ++ system(@cmd) == 0; ++ if ($? != 0) { ++ $no_lsx = 1; ++ } else { ++ $no_lsx = 0; ++ } ++ unlink("$tmplsx.o"); ++ ++ $tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); ++ $codelasx = '"xvadd.b $xr0, $xr0, $xr0"'; ++ $lasx_flags = "-march=loongarch64 -mlasx"; ++ print $tmplasx "#include \n\n"; ++ print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n"; ++ ++ $args = "$lasx_flags -o $tmplasx.o $tmplasx"; ++ my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); ++ system(@cmd) == 0; ++ if ($? != 0) { ++ $no_lasx = 1; ++ } else { ++ $no_lasx = 0; ++ } ++ unlink("$tmplasx.o"); ++ } ++} ++ + $architecture = x86 if ($data =~ /ARCH_X86/); + $architecture = x86_64 if ($data =~ /ARCH_X86_64/); + $architecture = e2k if ($data =~ /ARCH_E2K/); +@@ -424,6 +465,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1; + print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; + print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1; + print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1; ++print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1; ++print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1; + + $os =~ tr/[a-z]/[A-Z]/; + $architecture =~ tr/[a-z]/[A-Z]/; +@@ -437,6 +480,8 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; + print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; + print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; + print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1; ++print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1; ++print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1; + + + if ($os eq "LINUX") { +diff --git a/common_loongarch64.h b/common_loongarch64.h +index e15539b5f..ce1fcf091 100644 +--- a/common_loongarch64.h ++++ b/common_loongarch64.h +@@ -83,6 +83,19 @@ static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; + } + ++#ifndef NO_AFFINITY ++static inline int WhereAmI(void){ ++ int ret = 0, counter = 0; ++ __asm__ volatile ( ++ "rdtimel.w %[counter], %[id]" ++ : [id]"=r"(ret), [counter]"=r"(counter) ++ : ++ : "memory" ++ ); ++ return ret; ++} ++#endif ++ + #ifdef DOUBLE + #define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") + #else +diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c +index ca07c7ffb..7c389db27 100644 +--- a/cpuid_loongarch64.c ++++ b/cpuid_loongarch64.c +@@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + **********************************************************************************/ + + #include ++#include + + /* If LASX extension instructions supported, + * using core LOONGSON3R5 +@@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ #define CPU_LOONGSON3R5 1 + #define CPU_LOONGSON2K1000 2 + +-#define LOONGARCH_CFG2 0x02 +-#define LOONGARCH_LASX 1<<7 +-#define LOONGARCH_LSX 1<<6 ++#define LA_HWCAP_LSX (1<<4) ++#define LA_HWCAP_LASX (1<<5) + + static char *cpuname[] = { + "LOONGSONGENERIC", +@@ -64,17 +64,11 @@ static char *cpuname_lower[] = { + + int detect(void) { + #ifdef __linux +- uint32_t reg = 0; ++ int flag = (int)getauxval(AT_HWCAP); + +- __asm__ volatile ( +- "cpucfg %0, %1 \n\t" +- : "+&r"(reg) +- : "r"(LOONGARCH_CFG2) +- ); +- +- if (reg & LOONGARCH_LASX) ++ if (flag & LA_HWCAP_LASX) + return CPU_LOONGSON3R5; +- else if (reg & LOONGARCH_LSX) ++ else if (flag & LA_HWCAP_LSX) + return CPU_LOONGSON2K1000; + else + return CPU_GENERIC; +diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 +index cda359040..011e8b89e 100644 +--- a/kernel/loongarch64/KERNEL.LOONGSON3R5 ++++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 +@@ -1,3 +1,4 @@ ++ifndef NO_LASX + DGEMMKERNEL = dgemm_kernel_16x4.S + DGEMMINCOPY = dgemm_ncopy_16.S + DGEMMITCOPY = dgemm_tcopy_16.S +@@ -8,7 +9,29 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) + DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) + DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +-DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +-DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +-DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +-DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ++DGEMVNKERNEL = dgemv_n_8_lasx.S ++DGEMVTKERNEL = dgemv_t_8_lasx.S ++ ++SGEMMKERNEL = sgemm_kernel_16x8_lasx.S ++SGEMMINCOPY = sgemm_ncopy_16_lasx.S ++SGEMMITCOPY = sgemm_tcopy_16_lasx.S ++SGEMMONCOPY = sgemm_ncopy_8_lasx.S ++SGEMMOTCOPY = sgemm_tcopy_8_lasx.S ++SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) ++SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) ++SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) ++SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) ++ ++SGEMVNKERNEL = sgemv_n_8_lasx.S ++SGEMVTKERNEL = sgemv_t_8_lasx.S ++ ++DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S ++DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_lasx.S ++DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S ++DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S ++endif ++ ++STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ++STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ++STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ++STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic +index b772a6f82..213add9ee 100644 +--- a/kernel/loongarch64/KERNEL.generic ++++ b/kernel/loongarch64/KERNEL.generic +@@ -132,12 +132,16 @@ CSWAPKERNEL = ../arm/zswap.c + ZSWAPKERNEL = ../arm/zswap.c + + SGEMVNKERNEL = ../arm/gemv_n.c ++ifndef DGEMVNKERNEL + DGEMVNKERNEL = ../arm/gemv_n.c ++endif + CGEMVNKERNEL = ../arm/zgemv_n.c + ZGEMVNKERNEL = ../arm/zgemv_n.c + + SGEMVTKERNEL = ../arm/gemv_t.c ++ifndef DGEMVTKERNEL + DGEMVTKERNEL = ../arm/gemv_t.c ++endif + CGEMVTKERNEL = ../arm/zgemv_t.c + ZGEMVTKERNEL = ../arm/zgemv_t.c + +diff --git a/kernel/loongarch64/dgemm_kernel_16x4.S b/kernel/loongarch64/dgemm_kernel_16x4.S +index 13faa977e..f8e26fda2 100644 +--- a/kernel/loongarch64/dgemm_kernel_16x4.S ++++ b/kernel/loongarch64/dgemm_kernel_16x4.S +@@ -28,6 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + #include "common.h" + ++/********************************************************************* ++* 2023/06/28 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++* 2023/06/28 guxiwei ++* Parameter: ++* DGEMM_DEFAULT_UNROLL_N 4 ++* DGEMM_DEFAULT_UNROLL_M 16 ++* DGEMM_DEFAULT_P 32 ++* DGEMM_DEFAULT_Q 152 ++* DGEMM_DEFAULT_R 858 ++* A_PR1 1024 ++* B_PR1 256 ++* ++* ++* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000: ++* 1 thread: 36.0 GFLOPS ++* 2 threads: 71.6 GFLOPS ++* 3 threads: 101.5 GFLOPS ++* 4 threads: 132.8 GFLOPS ++*********************************************************************/ ++ + /* Function parameters */ + #define M $r4 // param 1: bm + #define N $r5 // param 2: bn +@@ -68,1290 +93,1331 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #define U4 $xr4 + #define U5 $xr5 + #define U6 $xr6 +-#define D0 $xr7 +-#define D1 $xr8 +-#define D2 $xr9 +-#define D3 $xr10 +-#define D4 $xr11 +-#define D5 $xr12 +-#define D6 $xr13 +-#define D7 $xr14 +-#define D8 $xr15 +-#define D9 $xr16 +-#define D10 $xr17 +-#define D11 $xr18 +-#define D12 $xr19 +-#define D13 $xr20 +-#define D14 $xr21 +-#define D15 $xr22 +-#define VALPHA $xr23 ++#define U7 $xr7 ++#define U8 $xr8 ++#define U9 $xr9 ++#define U10 $xr10 ++#define U11 $xr11 ++#define U12 $xr12 ++#define U13 $xr13 ++#define U14 $xr14 ++#define U15 $xr15 ++#define D0 $xr16 ++#define D1 $xr17 ++#define D2 $xr18 ++#define D3 $xr19 ++#define D4 $xr20 ++#define D5 $xr21 ++#define D6 $xr22 ++#define D7 $xr23 ++#define D8 $xr24 ++#define D9 $xr25 ++#define D10 $xr26 ++#define D11 $xr27 ++#define D12 $xr28 ++#define D13 $xr29 ++#define D14 $xr30 ++#define D15 $xr31 ++#define VALPHA $xr15 + + /* Prefetch interval */ +-#define A_PRE 0x200 ++#define A_PRE 0x400 + #define B_PRE 0x100 + +- PROLOGUE +- +- addi.d $sp, $sp, -56 +- /* Store regs */ +- SDARG $r23, $sp, 0 +- SDARG $r24, $sp, 8 +- SDARG $r25, $sp, 16 +- SDARG $r26, $sp, 24 +- SDARG $r27, $sp, 32 +- ST $f23, $sp, 40 +- ST ALPHA, $sp, 48 +- +- /* VALPHA = {ALPHA, ALPHA, ALPHA, ALPHA} */ +- xvld VALPHA, $sp, 48 +- xvreplve0.d VALPHA, VALPHA +- +-#if defined (TRMMKERNEL) && !defined(LEFT) +- sub.d OFF, ZERO, OFFSET +-#else +- xor OFF, OFF, OFF +-#endif +- +- /* if (!(N >> 2)) goto L_N3 */ +- srai.d J, N, 2 /* J = bn >> 2 */ +- andi N, N, 0x03 +- beq ZERO, J, .L_N3 +- +-.L_J1: /* J-- && This loop include Condition 1 */ +- +-/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! 
************************* +-* dgemm_core_16x4 */ +- move C0, C +- move A0, A +- slli.d T0, LDC, 3 +- add.d C1, C0, T0 +- addi.d J, J, -1 /* J-- */ +- add.d C2, C1, T0 +- add.d C3, C2, T0 +- +-#if defined(TRMMKERNEL) && defined(LEFT) +- move OFF, OFFSET +-#endif +- +- /* if (!(M >> 4)) goto L_M8 */ +- srai.d I, M, 4 /* I = bm >> 4 */ +- beq ZERO, I, .L_M8 +- +-.L_I1: /* I-- */ +-#if defined(TRMMKERNEL) +-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +- move B0, B +-#else +- slli.d T0, OFF, 0x07 +- add.d A0, A0, T0 +- slli.d T0, OFF, 0x05 +- add.d B0, B, T0 +-#endif +- +-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +- sub.d L, K, OFF +-#elif defined(LEFT) +- /* number of values in A */ +- addi.d L, OFF, 16 +-#else +- /* number of values in B */ +- addi.d L, OFF, 4 +-#endif +-#else // #if !defined(TRMMKERNEL) +- move B0, B +- move L, K /* L = bk */ +-#endif +- /* Calculate the first set of D0~D15, +- * avoidig set 0 operation +- * Load 16 * 64 from A0 +- * U0 = {a3, a2, a1, a0} +- * U1 = {a7, a6, a5, a4} +- * U2 = {a11, a10, a9, a8} +- * U3 = {a15, a14, a13, a12} +- */ ++.macro KERNEL2x16x4 + xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- preld 0, C0, 0x00 +- /* line 1 */ +- xvfmul.d D0, U0, U4 +- xvfmul.d D1, U1, U4 +- preld 0, C0, 0x40 +- xvfmul.d D2, U2, U4 +- xvfmul.d D3, U3, U4 +- +- xvldrepl.d U4, B0, 0x08 +- preld 0, C1, 0x00 +- /* line 2 */ +- xvfmul.d D4, U0, U4 +- xvfmul.d D5, U1, U4 +- preld 0, C1, 0x40 +- xvfmul.d D6, U2, U4 +- xvfmul.d D7, U3, U4 +- +- xvldrepl.d U4, B0, 0x10 +- preld 0, C2, 0x00 +- /* line 3 */ +- xvfmul.d D8, U0, U4 +- xvfmul.d D9, U1, U4 +- preld 0, C2, 0x40 +- xvfmul.d D10, U2, U4 +- xvfmul.d D11, U3, U4 +- +- xvldrepl.d U4, B0, 0x18 +- preld 0, C3, 0x00 +- /* line 4 */ +- xvfmul.d D12, U0, U4 +- xvfmul.d D13, U1, U4 +- preld 0, C3, 0x40 +- xvfmul.d D14, U2, U4 +- xvfmul.d D15, U3, U4 +- +- /* Add stride for A0 and B0 */ +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x20 +- /* Reduce L */ +- addi.d L, L, -1 +- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ +- /* if (TL < 1) goto L_L7 */ +- beq ZERO,TL, .L_L7 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 + +- /* Calculate 8 sets of D0~D15 */ +-.L_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 + xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ + xvld U2, A0, 0x40 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ + xvld U3, A0, 0x60 ++ xvfmadd.d D6, U10, U13, D6 ++ xvfmadd.d D7, U11, U13, D7 + +- /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 ++ xvfmadd.d D8, U8, U14, D8 ++ xvfmadd.d D9, U9, U14, D9 ++ + preld 0, B0, B_PRE ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D10, U10, U14, D10 ++ xvfmadd.d D11, U11, U14, D11 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D12, U8, U15, D12 ++ xvfmadd.d D13, U9, U15, D13 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 +- xvfmadd.d D10, U2, U4, D10 +- xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 +- xvfmadd.d D14, U2, U4, D14 +- xvfmadd.d D15, U3, U4, D15 ++ xvldrepl.d U7, 
B0, 0x18 ++ xvfmadd.d D14, U10, U15, D14 ++ xvfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + +- /***8-2***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- /* Cumulative D0~D15 */ +- xvldrepl.d U4, B0, 0x00 ++ xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 ++ ++ xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 ++ ++ xvld U10, A0, 0x40 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvld U11, A0, 0x60 ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 ++ + preld 0, B0, B_PRE ++ xvldrepl.d U13, B0, 0x08 ++ xvfmadd.d D10, U2, U6, D10 ++ xvfmadd.d D11, U3, U6, D11 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE ++ xvldrepl.d U14, B0, 0x10 ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 +- xvfmadd.d D10, U2, U4, D10 +- xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 +- xvfmadd.d D14, U2, U4, D14 +- xvfmadd.d D15, U3, U4, D15 ++ xvldrepl.d U15, B0, 0x18 ++ xvfmadd.d D14, U2, U7, D14 ++ xvfmadd.d D15, U3, U7, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 ++.endm + +- /***8-3***/ +- /* Load 16 * 64 from A0 */ ++.macro KERNEL2x16x4_END + xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ + xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ + xvld U2, A0, 0x40 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ + xvld U3, A0, 0x60 ++ xvfmadd.d D6, U10, U13, D6 ++ xvfmadd.d D7, U11, U13, D7 + +- /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 ++ xvfmadd.d D8, U8, U14, D8 ++ xvfmadd.d D9, U9, U14, D9 ++ + preld 0, B0, B_PRE ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D10, U10, U14, D10 ++ xvfmadd.d D11, U11, U14, D11 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D12, U8, U15, D12 ++ xvfmadd.d D13, U9, U15, D13 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 +- xvfmadd.d D10, U2, U4, D10 +- xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 +- xvfmadd.d D14, U2, U4, D14 +- xvfmadd.d D15, U3, U4, D15 ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D14, U10, U15, D14 ++ xvfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + +- /***8-4***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- /* Cumulative D0~D15 */ +- xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 ++ + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 ++ ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 ++ ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 ++ + preld 0, B0, B_PRE ++ xvfmadd.d D10, U2, U6, D10 ++ xvfmadd.d D11, U3, U6, D11 + +- 
xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 +- xvfmadd.d D10, U2, U4, D10 +- xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 ++ xvfmadd.d D14, U2, U7, D14 ++ xvfmadd.d D15, U3, U7, D15 ++.endm + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 +- xvfmadd.d D14, U2, U4, D14 +- xvfmadd.d D15, U3, U4, D15 ++.macro KERNEL8x16x4 ++.rept 4 ++ KERNEL2x16x4 ++.endr ++.endm + +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x20 ++.macro KERNEL8x16x4_END ++.rept 3 ++ KERNEL2x16x4 ++.endr ++ KERNEL2x16x4_END ++.endm + +- /***8-5***/ +- /* Load 16 * 64 from A0 */ ++.macro KERNEL2x8x4 + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 + +- /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- preld 0, B0, B_PRE ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- preld 0, A0, A_PRE ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 +- xvfmadd.d D10, U2, U4, D10 +- xvfmadd.d D11, U3, U4, D11 +- preld 0, A0, A_PRE + 0x40 ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U8, U14, D8 ++ xvfmadd.d D9, U9, U14, D9 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 +- xvfmadd.d D14, U2, U4, D14 +- xvfmadd.d D15, U3, U4, D15 ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U8, U15, D12 ++ xvfmadd.d D13, U9, U15, D13 + +- addi.d A0, A0, 0x80 ++ addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + +- /***8-6***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 + +- /* Cumulative D0~D15 */ +- xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U12, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- preld 0, B0, B_PRE + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- preld 0, A0, A_PRE ++ xvldrepl.d U13, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 +- xvfmadd.d D10, U2, U4, D10 +- xvfmadd.d D11, U3, U4, D11 +- preld 0, A0, A_PRE + 0x40 ++ xvldrepl.d U14, B0, 0x10 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 +- xvfmadd.d D14, U2, U4, D14 +- xvfmadd.d D15, U3, U4, D15 ++ xvldrepl.d U15, B0, 0x18 ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 + +- addi.d A0, A0, 0x80 ++ addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 ++.endm + +- /***8-7***/ +- /* Load 16 * 64 from A0 */ ++.macro KERNEL2x8x4_END + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 + +- /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U8, U13, D4 
++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U8, U14, D8 ++ xvfmadd.d D9, U9, U14, D9 ++ ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U8, U15, D12 ++ xvfmadd.d D13, U9, U15, D13 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x20 ++ + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- preld 0, B0, B_PRE + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- preld 0, A0, A_PRE ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 +- xvfmadd.d D10, U2, U4, D10 +- xvfmadd.d D11, U3, U4, D11 +- preld 0, A0, A_PRE + 0x40 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 +- xvfmadd.d D14, U2, U4, D14 +- xvfmadd.d D15, U3, U4, D15 ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 ++.endm + +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x20 ++.macro KERNEL8x8x4 ++.rept 4 ++ KERNEL2x8x4 ++.endr ++.endm + +- /***8-8***/ +- /* Load 16 * 64 from A0 */ ++.macro KERNEL8x8x4_END ++.rept 3 ++ KERNEL2x8x4 ++.endr ++ KERNEL2x8x4_END ++.endm ++ ++.macro KERNEL2x4x4 + xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 + +- /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- preld 0, B0, B_PRE ++ xvfmadd.d D0, U8, U12, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- preld 0, A0, A_PRE ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U8, U13, D4 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 +- xvfmadd.d D10, U2, U4, D10 +- xvfmadd.d D11, U3, U4, D11 +- preld 0, A0, A_PRE + 0x40 ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U8, U14, D8 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 +- xvfmadd.d D14, U2, U4, D14 +- xvfmadd.d D15, U3, U4, D15 ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U8, U15, D12 + +- addi.d A0, A0, 0x80 ++ addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + +- addi.d TL, TL, -1 /* TL-- */ +- blt ZERO,TL, .L_TL1 ++ xvld U8, A0, 0x00 + +- /* Maybe we need calculate the last +- * 7 sets of D0~D15? 
+- */ +-.L_L7: +- /* if (!(L & 7)) goto L_L0 */ +- andi TL, L, 7 +- beq TL, ZERO,.L_L0 ++ xvldrepl.d U12, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 + +-.L_L71: +- /* Load 16 * 64 from A0 */ ++ xvldrepl.d U13, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ ++ xvldrepl.d U14, B0, 0x10 ++ xvfmadd.d D8, U0, U6, D8 ++ ++ xvldrepl.d U15, B0, 0x18 ++ xvfmadd.d D12, U0, U7, D12 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 ++.endm ++ ++.macro KERNEL2x4x4_END + xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 + +- /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 ++ xvfmadd.d D0, U8, U12, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U8, U13, D4 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 +- xvfmadd.d D10, U2, U4, D10 +- xvfmadd.d D11, U3, U4, D11 ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U8, U14, D8 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 +- xvfmadd.d D14, U2, U4, D14 +- xvfmadd.d D15, U3, U4, D15 ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U8, U15, D12 + +- /* Add stride for A0, B0 */ +- addi.d A0, A0, 0x80 ++ addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + +- addi.d TL, TL, -1 +- blt ZERO,TL, .L_L71 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D12, U0, U7, D12 ++.endm + +-.L_L0: +-#if defined(TRMMKERNEL) +- xvfmul.d D0, D0, VALPHA +- xvfmul.d D1, D1, VALPHA +- xvfmul.d D2, D2, VALPHA +- xvfmul.d D3, D3, VALPHA +- xvfmul.d D4, D4, VALPHA +- xvfmul.d D5, D5, VALPHA +- xvfmul.d D6, D6, VALPHA +- xvfmul.d D7, D7, VALPHA +- xvfmul.d D8, D8, VALPHA +- xvfmul.d D9, D9, VALPHA +- xvfmul.d D10, D10, VALPHA +- xvfmul.d D11, D11, VALPHA +- xvfmul.d D12, D12, VALPHA +- xvfmul.d D13, D13, VALPHA +- xvfmul.d D14, D14, VALPHA +- xvfmul.d D15, D15, VALPHA +-#else +- /* Load C0 */ +- xvld U0, C0, 0x00 +- xvld U1, C0, 0x20 +- xvld U2, C0, 0x40 +- xvld U3, C0, 0x60 +- xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +- xvfmadd.d D1, D1, VALPHA, U1 +- xvfmadd.d D2, D2, VALPHA, U2 +- xvfmadd.d D3, D3, VALPHA, U3 ++.macro KERNEL8x4x4 ++.rept 4 ++ KERNEL2x4x4 ++.endr ++.endm + +- /* Load C1 */ +- xvld U0, C1, 0x00 +- xvld U1, C1, 0x20 +- xvld U2, C1, 0x40 +- xvld U3, C1, 0x60 +- xvfmadd.d D4, D4, VALPHA, U0 +- xvfmadd.d D5, D5, VALPHA, U1 +- xvfmadd.d D6, D6, VALPHA, U2 +- xvfmadd.d D7, D7, VALPHA, U3 ++.macro KERNEL8x4x4_END ++.rept 3 ++ KERNEL2x4x4 ++.endr ++ KERNEL2x4x4_END ++.endm + +- /* Load C2 */ +- xvld U0, C2, 0x00 +- xvld U1, C2, 0x20 +- xvld U2, C2, 0x40 +- xvld U3, C2, 0x60 +- xvfmadd.d D8, D8, VALPHA, U0 +- xvfmadd.d D9, D9, VALPHA, U1 +- xvfmadd.d D10, D10, VALPHA, U2 +- xvfmadd.d D11, D11, VALPHA, U3 ++.macro KERNEL2x2x4 ++ xvldrepl.d U0, A0, 0x00 ++ xvldrepl.d U1, A0, 0x08 + +- /* Load C3 */ +- xvld U0, C3, 0x00 +- xvld U1, C3, 0x20 +- xvld U2, C3, 0x40 +- xvld U3, C3, 0x60 +- xvfmadd.d D12, D12, VALPHA, U0 +- xvfmadd.d D13, D13, VALPHA, U1 +- xvfmadd.d D14, D14, VALPHA, U2 +- xvfmadd.d D15, D15, VALPHA, U3 +-#endif // #if defined(TRMMKERNEL) ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 + +- /* Store C0 */ +- xvst D0, C0, 0x00 +- xvst D1, C0, 0x20 +- xvst D2, C0, 0x40 +- xvst D3, C0, 0x60 +- /* Store C1 */ +- xvst D4, C1, 0x00 +- xvst D5, C1, 0x20 +- xvst D6, C1, 0x40 
+- xvst D7, C1, 0x60 +- /* Store C2 */ +- xvst D8, C2, 0x00 +- xvst D9, C2, 0x20 +- xvst D10, C2, 0x40 +- xvst D11, C2, 0x60 +- /* Store C3 */ +- xvst D12, C3, 0x00 +- xvst D13, C3, 0x20 +- xvst D14, C3, 0x40 +- xvst D15, C3, 0x60 ++ xvld U4, B0, 0x00 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 + +- /* Add stride for C */ +- addi.d C0, C0, 0x80 +- addi.d C1, C1, 0x80 +- addi.d C2, C2, 0x80 +- addi.d C3, C3, 0x80 ++ xvldrepl.d U8, A0, 0x00 ++ xvldrepl.d U9, A0, 0x08 + +-#if defined(TRMMKERNEL) +-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +- sub.d L, K, OFF +-#ifdef LEFT +- /* number of values in A */ +- addi.d L, L, -16 +-#else +- /* number of values in B */ +- addi.d L, L, -4 +-#endif +- slli.d T0, L, 0x07 +- add.d A0, A0, T0 +- slli.d T0, L, 0x05 +- add.d B0, B0, T0 +-#endif ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 + +-#ifdef LEFT +- addi.d OFF, OFF, 0x10 +-#endif +-#endif // #if defined(TRMMKERNEL) ++ xvld U12, B0, 0x00 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++.endm + +- addi.d I, I, -1 /* I-- */ +- blt ZERO,I, .L_I1 ++.macro KERNEL2x2x4_END ++ xvldrepl.d U0, A0, 0x00 ++ xvldrepl.d U1, A0, 0x08 + +-.L_M8: +- /* We have done M & 16, considering M=8/4/2/1 */ +- andi I, M, 15 +- beq ZERO,I, .L_M0 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 + +- andi I, M, 8 +- beq ZERO,I, .L_M4 ++ xvld U4, B0, 0x00 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 + +-#if defined(TRMMKERNEL) +-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +- move B0, B +-#else +- slli.d T0, OFF, 0x06 +- add.d A0, A0, T0 +- slli.d T0, OFF, 0x05 +- add.d B0, B, T0 +-#endif ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++.endm + +-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +- sub.d L, K, OFF +-#elif defined(LEFT) +- /* number of values in A */ +- addi.d L, OFF, 8 +-#else +- /* number of values in B */ +- addi.d L, OFF, 4 +-#endif +-#else // #if !defined(TRMMKERNEL) +- move B0, B +- move L, K /* L = bk */ +-#endif // #if defined(TRMMKERNEL) ++.macro KERNEL8x2x4 ++.rept 4 ++ KERNEL2x2x4 ++.endr ++.endm + +- /* Load 8 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 ++.macro KERNEL8x2x4_END ++.rept 3 ++ KERNEL2x2x4 ++.endr ++ KERNEL2x2x4_END ++.endm + +- xvldrepl.d U4, B0, 0x00 +- /* line 1 */ +- xvfmul.d D0, U0, U4 +- xvfmul.d D1, U1, U4 ++.macro KERNEL2x1x4 ++ xvldrepl.d U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvld U4, B0, 0x00 + +- xvldrepl.d U4, B0, 0x08 +- /* line 2 */ +- xvfmul.d D4, U0, U4 +- xvfmul.d D5, U1, U4 ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 + +- xvldrepl.d U4, B0, 0x10 +- /* line 3 */ +- xvfmul.d D8, U0, U4 +- xvfmul.d D9, U1, U4 ++ xvldrepl.d U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvld U12, B0, 0x00 + +- xvldrepl.d U4, B0, 0x18 +- /* line 4 */ +- xvfmul.d D12, U0, U4 +- xvfmul.d D13, U1, U4 ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++.endm + +- /* Add stride for A0 and B0 */ +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 +- /* Reduce L */ +- addi.d L, L, -1 +- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ +- /* if (TL < 1) goto L_M8_L7 */ +- beq ZERO,TL, .L_M8_L7 ++.macro KERNEL2x1x4_END ++ xvldrepl.d U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvld U4, B0, 0x00 + +-.L_M8_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 16 * 64 from A0 */ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++ ++ xvfmadd.d D0, U0, U4, D0 ++.endm ++ ++.macro KERNEL8x1x4 ++.rept 4 ++ KERNEL2x1x4 ++.endr ++.endm ++ ++.macro KERNEL8x1x4_END ++.rept 3 ++ KERNEL2x1x4 ++.endr ++ 
KERNEL2x1x4_END ++.endm ++ ++.macro KERNEL2x16x2 + xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ + xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ ++ xvld U2, A0, 0x40 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvld U3, A0, 0x60 ++ xvfmadd.d D6, U10, U13, D6 ++ xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ ++ xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ xvld U9, A0, 0x20 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 ++ xvld U10, A0, 0x40 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 ++ xvld U11, A0, 0x60 ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++.endm + +- /***8-2***/ ++.macro KERNEL2x16x2_END + xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ + xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ ++ xvld U2, A0, 0x40 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvld U3, A0, 0x60 ++ xvfmadd.d D6, U10, U13, D6 ++ xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 ++.endm + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 ++.macro KERNEL8x16x2 ++.rept 4 ++ KERNEL2x16x2 ++.endr ++.endm ++ ++.macro KERNEL8x16x2_END ++.rept 3 ++ KERNEL2x16x2 ++.endr ++ KERNEL2x16x2_END ++.endm + +- /***8-3***/ ++.macro KERNEL2x8x2 + xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ + xvld U1, A0, 0x20 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++ ++ xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 ++ xvld U9, A0, 0x20 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 ++ addi.d B0, B0, 0x10 ++.endm + +- /***8-4***/ ++.macro KERNEL2x8x2_END + xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ + xvld U1, A0, 0x20 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++ + xvfmadd.d D0, 
U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++.endm + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 ++.macro KERNEL8x8x2 ++.rept 4 ++ KERNEL2x8x2 ++.endr ++.endm + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 ++.macro KERNEL8x8x2_END ++.rept 3 ++ KERNEL2x8x2 ++ .endr ++ KERNEL2x8x2_END ++.endm + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 +- +- /***8-5***/ ++.macro KERNEL2x4x2 + xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 ++ xvldrepl.d U5, B0, 0x08 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++.endm + +- /***8-6***/ ++.macro KERNEL2x4x2_END + xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 ++ xvldrepl.d U5, B0, 0x08 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++.endm + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 ++.macro KERNEL8x4x2 ++.rept 4 ++ KERNEL2x4x2 ++.endr ++.endm + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 ++.macro KERNEL8x4x2_END ++.rept 3 ++ KERNEL2x4x2 ++.endr ++ KERNEL2x4x2_END ++.endm + +- /***8-7***/ ++.macro KERNEL2x2x2 + xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++ ++ xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D4, U0, U5, D4 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++.endm + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 ++.macro KERNEL2x2x2_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++.endm + +- /***8-8***/ ++.macro KERNEL8x2x2 ++.rept 4 ++ KERNEL2x2x2 ++.endr ++.endm ++ ++.macro KERNEL8x2x2_END ++.rept 3 ++ KERNEL2x2x2 ++.endr ++ KERNEL2x2x2_END ++.endm ++ ++.macro KERNEL2x1x2 + xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ 
addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ ++ xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D4, U0, U5, D4 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++.endm + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 ++.macro KERNEL2x1x2_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 + +- addi.d TL, TL, -1 /* TL-- */ +- blt ZERO,TL, .L_M8_TL1 ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 + +-.L_M8_L7: +- /* if (!(L & 7)) goto L_M8_L0 */ +- andi TL, L, 7 +- beq TL, ZERO,.L_M8_L0 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++.endm + +-.L_M8_L71: ++.macro KERNEL8x1x2 ++.rept 4 ++ KERNEL2x1x2 ++.endr ++.endm ++ ++.macro KERNEL8x1x2_END ++.rept 3 ++ KERNEL2x1x2 ++.endr ++ KERNEL2x1x2_END ++.endm ++ ++.macro KERNEL2x16x1 + xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ + xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++ ++ xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ xvld U9, A0, 0x20 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- xvfmadd.d D9, U1, U4, D9 ++ xvld U10, A0, 0x40 ++ xvld U11, A0, 0x60 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- xvfmadd.d D13, U1, U4, D13 ++ xvldrepl.d U12, B0, 0x00 + +- /* Add stride for A0, B0 */ +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x20 ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++.endm + +- addi.d TL, TL, -1 +- blt ZERO,TL, .L_M8_L71 ++.macro KERNEL2x16x1_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 + +-.L_M8_L0: +-#if defined(TRMMKERNEL) +- xvfmul.d D0, D0, VALPHA +- xvfmul.d D1, D1, VALPHA +- xvfmul.d D4, D4, VALPHA +- xvfmul.d D5, D5, VALPHA +- xvfmul.d D8, D8, VALPHA +- xvfmul.d D9, D9, VALPHA +- xvfmul.d D12, D12, VALPHA +- xvfmul.d D13, D13, VALPHA +-#else +- /* Load C0 */ +- xvld U0, C0, 0x00 +- xvld U1, C0, 0x20 +- xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +- xvfmadd.d D1, D1, VALPHA, U1 ++ xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 + +- /* Load C1 */ +- xvld U0, C1, 0x00 +- xvld U1, C1, 0x20 +- xvfmadd.d D4, D4, VALPHA, U0 +- xvfmadd.d D5, D5, VALPHA, U1 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 + +- /* Load C2 */ +- xvld U0, C2, 0x00 +- xvld U1, C2, 0x20 +- xvfmadd.d D8, D8, VALPHA, U0 +- xvfmadd.d D9, D9, VALPHA, U1 ++ xvldrepl.d U4, B0, 0x00 + +- /* Load C3 */ +- xvld U0, C3, 0x00 +- xvld U1, C3, 0x20 +- xvfmadd.d D12, D12, VALPHA, U0 +- xvfmadd.d D13, D13, VALPHA, U1 +-#endif // #if defined(TRMMKERNEL) ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 + +- /* Store C0 */ +- xvst D0, C0, 0x00 +- xvst D1, C0, 0x20 +- /* Store C1 */ +- xvst D4, C1, 0x00 +- xvst D5, C1, 0x20 +- /* Store C2 */ +- xvst D8, C2, 0x00 +- xvst D9, C2, 0x20 +- /* Store C3 */ +- xvst D12, C3, 0x00 +- xvst D13, C3, 0x20 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, 
U1, U4, D1 + +- /* Add stride for C */ +- addi.d C0, C0, 0x40 +- addi.d C1, C1, 0x40 +- addi.d C2, C2, 0x40 +- addi.d C3, C3, 0x40 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++.endm + +-#if defined(TRMMKERNEL) +-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +- sub.d L, K, OFF +-#ifdef LEFT +- /* number of values in A */ +- addi.d L, L, -8 +-#else +- /* number of values in B */ +- addi.d L, L, -4 +-#endif +- slli.d T0, L, 0x06 +- add.d A0, A0, T0 +- slli.d T0, L, 0x05 +- add.d B0, B0, T0 +-#endif ++.macro KERNEL8x16x1 ++.rept 4 ++ KERNEL2x16x1 ++.endr ++.endm + +-#ifdef LEFT +- /* number of values in A */ +- addi.d OFF, OFF, 0x08 +-#endif +-#endif // #if defined(TRMMKERNEL) ++.macro KERNEL8x16x1_END ++.rept 3 ++ KERNEL2x16x1 ++.endr ++ KERNEL2x16x1_END ++.endm + +-/********LOOP (if(N >> 2 ) && (M & 8)) End************/ ++.macro KERNEL2x8x1 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ xvld U1, A0, 0x20 ++ xvldrepl.d U4, B0, 0x00 + +-.L_M4: +- andi I, M, 4 +- beq ZERO,I, .L_M2 ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 + +-#if defined(TRMMKERNEL) +-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +- move B0, B +-#else +- slli.d T0, OFF, 0x05 +- add.d A0, A0, T0 +- add.d B0, B, T0 +-#endif ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvld U9, A0, 0x20 ++ xvldrepl.d U12, B0, 0x00 + +-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +- sub.d L, K, OFF +-#elif defined(LEFT) +- /* number of values in A */ +- addi.d L, OFF, 4 +-#else +- /* number of values in B */ +- addi.d L, OFF, 4 +-#endif +-#else // #if !defined(TRMMKERNEL) +- move B0, B +- move L, K /* L = bk */ +-#endif ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++.endm + +- /* Load 4 * 64 from A0 */ ++.macro KERNEL2x8x1_END + xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ xvld U1, A0, 0x20 ++ xvldrepl.d U4, B0, 0x00 + +- xvldrepl.d U4, B0, 0x00 +- /* line 1 */ +- xvfmul.d D0, U0, U4 +- +- xvldrepl.d U4, B0, 0x08 +- /* line 2 */ +- xvfmul.d D4, U0, U4 ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 + +- xvldrepl.d U4, B0, 0x10 +- /* line 3 */ +- xvfmul.d D8, U0, U4 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++.endm + +- xvldrepl.d U4, B0, 0x18 +- /* line 4 */ +- xvfmul.d D12, U0, U4 ++.macro KERNEL8x8x1 ++.rept 4 ++ KERNEL2x8x1 ++.endr ++.endm + +- /* Add stride for A0 and B0 */ +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x20 +- /* Reduce L */ +- addi.d L, L, -1 +- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ +- /* if (TL < 1) goto L_M4_L7 */ +- beq ZERO,TL, .L_M4_L7 ++.macro KERNEL8x8x1_END ++.rept 3 ++ KERNEL2x8x1 ++.endr ++ KERNEL2x8x1_END ++.endm + +-.L_M4_TL1: /* TL-- */ +- /***8-1***/ ++.macro KERNEL2x4x1 + xvld U0, A0, 0x00 +- ++ xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x20 ++ addi.d B0, B0, 0x08 ++.endm + +- /***8-2***/ ++.macro KERNEL2x4x1_END + xvld U0, A0, 0x00 +- ++ xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 
+ +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D0, U0, U4, D0 ++.endm + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++.macro KERNEL8x4x1 ++.rept 4 ++ KERNEL2x4x1 ++.endr ++.endm + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x20 ++.macro KERNEL8x4x1_END ++.rept 3 ++ KERNEL2x4x1 ++.endr ++ KERNEL2x4x1_END ++.endm + +- /***8-3***/ ++.macro KERNEL2x2x1 + xvld U0, A0, 0x00 +- ++ xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvldrepl.d U12, B0, 0x00 + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x20 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 ++.endm + +- /***8-4***/ ++.macro KERNEL2x2x1_END + xvld U0, A0, 0x00 +- ++ xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ xvfmadd.d D0, U0, U4, D0 ++.endm + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++.macro KERNEL8x2x1 ++.rept 4 ++ KERNEL2x2x1 ++.endr ++.endm + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x20 ++.macro KERNEL8x2x1_END ++.rept 3 ++ KERNEL2x2x1 ++.endr ++ KERNEL2x2x1_END ++.endm + +- /***8-5***/ ++.macro KERNEL2x1x1 + xvld U0, A0, 0x00 +- ++ xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvldrepl.d U12, B0, 0x00 + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x20 ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++.endm + +- /***8-6***/ ++.macro KERNEL2x1x1_END + xvld U0, A0, 0x00 +- ++ xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ + xvfmadd.d D0, U0, U4, D0 ++.endm + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++.macro KERNEL8x1x1 ++.rept 4 ++ KERNEL2x1x1 ++.endr ++.endm + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++.macro KERNEL8x1x1_END ++.rept 3 ++ KERNEL2x1x1 ++.endr ++ KERNEL2x1x1_END ++.endm + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x20 ++ PROLOGUE + +- /***8-7***/ +- xvld U0, A0, 0x00 ++ addi.d $sp, $sp, -120 ++ /* Store regs */ ++ SDARG $r23, $sp, 0 ++ SDARG $r24, $sp, 8 ++ SDARG $r25, $sp, 16 ++ SDARG $r26, $sp, 24 ++ SDARG $r27, $sp, 32 ++ ST $f23, $sp, 40 ++ ST $f24, $sp, 48 ++ ST $f25, $sp, 56 ++ ST $f26, $sp, 64 ++ ST $f27, $sp, 72 ++ ST $f28, $sp, 80 ++ ST $f29, $sp, 88 ++ ST $f30, $sp, 96 ++ ST $f31, $sp, 104 ++ ST ALPHA, $sp, 112 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++#if defined (TRMMKERNEL) && !defined(LEFT) ++ sub.d OFF, ZERO, OFFSET ++#else ++ xor OFF, OFF, OFF ++#endif + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ /* if (!(N >> 2)) goto L_N3 */ ++ srai.d J, N, 2 /* J = bn >> 2 */ ++ andi N, N, 0x03 ++ xvldrepl.d VALPHA, $sp, 112 /* When N < 4, VALPHA will not changed */ ++ beq ZERO, J, .L_N3 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++.L_J1: /* J-- && This loop include Condition 1 */ + +- xvldrepl.d U4, 
B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! ************************* ++* dgemm_core_16x4 */ ++ move C0, C ++ move A0, A ++ slli.d T0, LDC, 3 ++ add.d C1, C0, T0 ++ addi.d J, J, -1 /* J-- */ ++ add.d C2, C1, T0 ++ add.d C3, C2, T0 + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x20 ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move OFF, OFFSET ++#endif ++ ++ /* if (!(M >> 4)) goto L_M8 */ ++ srai.d I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_M8 + +- /***8-8***/ ++.L_I1: /* I-- */ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x07 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x05 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 16 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ /* Calculate the first set of D0~D15, ++ * avoidig set 0 operation ++ * Load 16 * 64 from A0 ++ * U0 = {a3, a2, a1, a0} ++ * U1 = {a7, a6, a5, a4} ++ * U2 = {a11, a10, a9, a8} ++ * U3 = {a15, a14, a13, a12} ++ */ + xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ xvldrepl.d U4, B0, 0x00 ++ preld 0, C0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ preld 0, C0, 0x40 ++ xvfmul.d D2, U2, U4 ++ xvfmul.d D3, U3, U4 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvldrepl.d U5, B0, 0x08 ++ preld 0, C1, 0x00 ++ /* line 2 */ ++ xvfmul.d D4, U0, U5 ++ xvfmul.d D5, U1, U5 ++ preld 0, C1, 0x40 ++ xvfmul.d D6, U2, U5 ++ xvfmul.d D7, U3, U5 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ xvldrepl.d U6, B0, 0x10 ++ preld 0, C2, 0x00 ++ /* line 3 */ ++ xvfmul.d D8, U0, U6 ++ xvfmul.d D9, U1, U6 ++ preld 0, C2, 0x40 ++ xvfmul.d D10, U2, U6 ++ xvfmul.d D11, U3, U6 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ xvldrepl.d U7, B0, 0x18 ++ preld 0, C3, 0x00 ++ /* line 4 */ ++ xvfmul.d D12, U0, U7 ++ xvfmul.d D13, U1, U7 ++ preld 0, C3, 0x40 ++ xvfmul.d D14, U2, U7 ++ xvfmul.d D15, U3, U7 + +- addi.d A0, A0, 0x20 ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x20 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_L7 */ ++ beq ZERO,TL, .L_L7 ++ ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ xvld U10, A0, 0x40 ++ xvld U11, A0, 0x60 ++ ++ addi.d TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ xvldrepl.d U14, B0, 0x10 ++ xvldrepl.d U15, B0, 0x18 ++ addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + ++ beq ZERO, TL, .L_TL1_END ++.L_TL1: /* TL-- */ ++ KERNEL8x16x4 + addi.d TL, TL, -1 /* TL-- */ +- blt ZERO,TL, .L_M4_TL1 ++ blt ZERO,TL, .L_TL1 + +-.L_M4_L7: +- /* if (!(L & 7)) goto L_M4_L0 */ ++.L_TL1_END: ++ KERNEL8x16x4_END ++ ++ /* Maybe we need calculate the last ++ * 7 sets of D0~D15? 
++ */ ++.L_L7: ++ /* if (!(L & 7)) goto L_L0 */ + andi TL, L, 7 +- beq TL, ZERO,.L_M4_L0 ++ beq TL, ZERO,.L_L0 + +-.L_M4_L71: ++.L_L71: ++ /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 + ++ /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 ++ xvfmadd.d D10, U2, U6, D10 ++ xvfmadd.d D11, U3, U6, D11 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 ++ xvfmadd.d D14, U2, U7, D14 ++ xvfmadd.d D15, U3, U7, D15 + + /* Add stride for A0, B0 */ +- addi.d A0, A0, 0x20 ++ addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 +- blt ZERO,TL, .L_M4_L71 ++ blt ZERO,TL, .L_L71 + +-.L_M4_L0: ++.L_L0: ++ xvldrepl.d VALPHA, $sp, 112 + #if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA ++ xvfmul.d D1, D1, VALPHA ++ xvfmul.d D2, D2, VALPHA ++ xvfmul.d D3, D3, VALPHA + xvfmul.d D4, D4, VALPHA ++ xvfmul.d D5, D5, VALPHA ++ xvfmul.d D6, D6, VALPHA ++ xvfmul.d D7, D7, VALPHA + xvfmul.d D8, D8, VALPHA ++ xvfmul.d D9, D9, VALPHA ++ xvfmul.d D10, D10, VALPHA ++ xvfmul.d D11, D11, VALPHA + xvfmul.d D12, D12, VALPHA ++ xvfmul.d D13, D13, VALPHA ++ xvfmul.d D14, D14, VALPHA ++ xvfmul.d D15, D15, VALPHA + #else + /* Load C0 */ + xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++ xvfmadd.d D1, D1, VALPHA, U1 ++ xvfmadd.d D2, D2, VALPHA, U2 ++ xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ +- xvld U0, C1, 0x00 +- xvfmadd.d D4, D4, VALPHA, U0 ++ xvld U4, C1, 0x00 ++ xvld U5, C1, 0x20 ++ xvld U6, C1, 0x40 ++ xvld U7, C1, 0x60 ++ xvfmadd.d D4, D4, VALPHA, U4 ++ xvfmadd.d D5, D5, VALPHA, U5 ++ xvfmadd.d D6, D6, VALPHA, U6 ++ xvfmadd.d D7, D7, VALPHA, U7 + + /* Load C2 */ +- xvld U0, C2, 0x00 +- xvfmadd.d D8, D8, VALPHA, U0 ++ xvld U8, C2, 0x00 ++ xvld U9, C2, 0x20 ++ xvld U10, C2, 0x40 ++ xvld U11, C2, 0x60 ++ xvfmadd.d D8, D8, VALPHA, U8 ++ xvfmadd.d D9, D9, VALPHA, U9 ++ xvfmadd.d D10, D10, VALPHA, U10 ++ xvfmadd.d D11, D11, VALPHA, U11 + + /* Load C3 */ + xvld U0, C3, 0x00 ++ xvld U1, C3, 0x20 ++ xvld U2, C3, 0x40 ++ xvld U3, C3, 0x60 + xvfmadd.d D12, D12, VALPHA, U0 +-#endif // #if defined(TRMMKERNEL) ++ xvfmadd.d D13, D13, VALPHA, U1 ++ xvfmadd.d D14, D14, VALPHA, U2 ++ xvfmadd.d D15, D15, VALPHA, U3 ++#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 ++ xvst D1, C0, 0x20 ++ xvst D2, C0, 0x40 ++ xvst D3, C0, 0x60 + /* Store C1 */ + xvst D4, C1, 0x00 ++ xvst D5, C1, 0x20 ++ xvst D6, C1, 0x40 ++ xvst D7, C1, 0x60 + /* Store C2 */ + xvst D8, C2, 0x00 ++ xvst D9, C2, 0x20 ++ xvst D10, C2, 0x40 ++ xvst D11, C2, 0x60 + /* Store C3 */ + xvst D12, C3, 0x00 ++ xvst D13, C3, 0x20 ++ xvst D14, C3, 0x40 ++ xvst D15, C3, 0x60 + + /* Add stride for C */ +- addi.d C0, C0, 0x20 +- addi.d C1, C1, 0x20 +- addi.d C2, C2, 0x20 +- addi.d C3, C3, 0x20 ++ addi.d C0, C0, 0x80 ++ addi.d C1, C1, 0x80 ++ addi.d C2, C2, 0x80 ++ addi.d C3, C3, 0x80 + + #if defined(TRMMKERNEL) + #if (defined(LEFT) && defined(TRANSA)) || 
(!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF + #ifdef LEFT +- /* number of values in A */ +- addi.d L, L, -4 ++ /* number of values in A */ ++ addi.d L, L, -16 + #else + /* number of values in B */ + addi.d L, L, -4 + #endif +- slli.d T0, L, 0x05 ++ slli.d T0, L, 0x07 + add.d A0, A0, T0 ++ slli.d T0, L, 0x05 + add.d B0, B0, T0 + #endif + + #ifdef LEFT +- /* number of values in A */ +- addi.d OFF, OFF, 0x04 ++ addi.d OFF, OFF, 0x10 + #endif + #endif // #if defined(TRMMKERNEL) + +-/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ ++ addi.d I, I, -1 /* I-- */ ++ blt ZERO,I, .L_I1 + +-.L_M2: +- andi I, M, 2 +- beq ZERO,I, .L_M1 ++.L_M8: ++ /* We have done M & 16, considering M=8/4/2/1 */ ++ andi I, M, 15 ++ beq ZERO,I, .L_M0 ++ ++ andi I, M, 8 ++ beq ZERO,I, .L_M4 + + #if defined(TRMMKERNEL) + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B + #else +- slli.d T0, OFF, 0x04 ++ slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +@@ -1361,7 +1427,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + sub.d L, K, OFF + #elif defined(LEFT) + /* number of values in A */ +- addi.d L, OFF, 2 ++ addi.d L, OFF, 8 + #else + /* number of values in B */ + addi.d L, OFF, 4 +@@ -1369,262 +1435,163 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +-#endif ++#endif // #if defined(TRMMKERNEL) + +- /* Load 2 * 64 from A0 */ ++ /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 + +- xvldrepl.d U4, B0, 0x08 ++ xvldrepl.d U5, B0, 0x08 + /* line 2 */ +- xvfmul.d D4, U0, U4 ++ xvfmul.d D4, U0, U5 ++ xvfmul.d D5, U1, U5 + +- xvldrepl.d U4, B0, 0x10 ++ xvldrepl.d U6, B0, 0x10 + /* line 3 */ +- xvfmul.d D8, U0, U4 ++ xvfmul.d D8, U0, U6 ++ xvfmul.d D9, U1, U6 + +- xvldrepl.d U4, B0, 0x18 ++ xvldrepl.d U7, B0, 0x18 + /* line 4 */ +- xvfmul.d D12, U0, U4 ++ xvfmul.d D12, U0, U7 ++ xvfmul.d D13, U1, U7 + + /* Add stride for A0 and B0 */ +- addi.d A0, A0, 0x10 ++ addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ +- /* if (TL < 1) goto L_M2_L7 */ +- beq ZERO,TL, .L_M2_L7 +- +-.L_M2_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 2 * 64 from A0 */ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x20 +- +- /***8-2***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x20 +- +- /***8-3***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x20 +- +- /***8-4***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, 
B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x20 +- +- /***8-5***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x20 +- +- /***8-6***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x20 +- +- /***8-7***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ /* if (TL < 1) goto L_M8_L7 */ ++ beq ZERO,TL, .L_M8_L7 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ addi.d TL, TL, -1 + +- addi.d A0, A0, 0x10 ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ xvldrepl.d U14, B0, 0x10 ++ xvldrepl.d U15, B0, 0x18 ++ addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + +- /***8-8***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ beq ZERO, TL, .L_M8_TL1_END + +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x20 ++.L_M8_TL1: /* TL-- */ ++ KERNEL8x8x4 + + addi.d TL, TL, -1 /* TL-- */ +- blt ZERO,TL, .L_M2_TL1 ++ blt ZERO,TL, .L_M8_TL1 + +-.L_M2_L7: +- /* if (!(L & 7)) goto L_M2_L0 */ ++.L_M8_TL1_END: ++ KERNEL8x8x4_END ++ ++.L_M8_L7: ++ /* if (!(L & 7)) goto L_M8_L0 */ + andi TL, L, 7 +- beq TL, ZERO,.L_M2_L0 ++ beq TL, ZERO,.L_M8_L0 + +-.L_M2_L71: ++.L_M8_L71: + xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 + + /* Add stride for A0, B0 */ +- addi.d A0, A0, 0x10 ++ addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 +- blt ZERO,TL, .L_M2_L71 ++ blt ZERO,TL, .L_M8_L71 + +-.L_M2_L0: ++.L_M8_L0: ++ xvldrepl.d VALPHA, $sp, 112 + #if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA ++ xvfmul.d D1, D1, VALPHA + xvfmul.d D4, D4, VALPHA ++ xvfmul.d D5, D5, VALPHA + xvfmul.d D8, D8, VALPHA ++ xvfmul.d D9, D9, VALPHA + xvfmul.d D12, D12, VALPHA ++ xvfmul.d D13, D13, VALPHA + #else + /* Load C0 */ + xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++ xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ +- xvld U0, C1, 0x00 +- xvfmadd.d D4, D4, VALPHA, U0 ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++ xvfmadd.d D4, D4, VALPHA, U2 ++ xvfmadd.d D5, D5, VALPHA, U3 + + /* Load C2 */ +- xvld U0, C2, 0x00 +- xvfmadd.d D8, D8, VALPHA, U0 ++ xvld U4, C2, 0x00 ++ xvld U5, C2, 0x20 ++ xvfmadd.d 
D8, D8, VALPHA, U4 ++ xvfmadd.d D9, D9, VALPHA, U5 + + /* Load C3 */ +- xvld U0, C3, 0x00 +- xvfmadd.d D12, D12, VALPHA, U0 ++ xvld U6, C3, 0x00 ++ xvld U7, C3, 0x20 ++ xvfmadd.d D12, D12, VALPHA, U6 ++ xvfmadd.d D13, D13, VALPHA, U7 + #endif // #if defined(TRMMKERNEL) + +- xvstelm.d D0, C0, 0x00, 0x00 +- xvstelm.d D4, C1, 0x00, 0x00 +- xvstelm.d D8, C2, 0x00, 0x00 +- xvstelm.d D12, C3, 0x00, 0x00 +- xvstelm.d D0, C0, 0x08, 0x01 +- xvstelm.d D4, C1, 0x08, 0x01 +- xvstelm.d D8, C2, 0x08, 0x01 +- xvstelm.d D12, C3, 0x08, 0x01 ++ /* Store C0 */ ++ xvst D0, C0, 0x00 ++ xvst D1, C0, 0x20 ++ /* Store C1 */ ++ xvst D4, C1, 0x00 ++ xvst D5, C1, 0x20 ++ /* Store C2 */ ++ xvst D8, C2, 0x00 ++ xvst D9, C2, 0x20 ++ /* Store C3 */ ++ xvst D12, C3, 0x00 ++ xvst D13, C3, 0x20 + + /* Add stride for C */ +- addi.d C0, C0, 0x10 +- addi.d C1, C1, 0x10 +- addi.d C2, C2, 0x10 +- addi.d C3, C3, 0x10 ++ addi.d C0, C0, 0x40 ++ addi.d C1, C1, 0x40 ++ addi.d C2, C2, 0x40 ++ addi.d C3, C3, 0x40 + + #if defined(TRMMKERNEL) + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF + #ifdef LEFT + /* number of values in A */ +- addi.d L, L, -2 ++ addi.d L, L, -8 + #else + /* number of values in B */ + addi.d L, L, -4 + #endif +- slli.d T0, L, 0x04 ++ slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +@@ -1632,23 +1599,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + #ifdef LEFT + /* number of values in A */ +- addi.d OFF, OFF, 0x02 ++ addi.d OFF, OFF, 0x08 + #endif + #endif // #if defined(TRMMKERNEL) + +-/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ ++/********LOOP (if(N >> 2 ) && (M & 8)) End************/ + +-.L_M1: +- andi I, M, 1 +- beq ZERO,I, .L_M0 ++.L_M4: ++ andi I, M, 4 ++ beq ZERO,I, .L_M2 + + #if defined(TRMMKERNEL) + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B + #else +- slli.d T0, OFF, 0x03 +- add.d A0, A0, T0 + slli.d T0, OFF, 0x05 ++ add.d A0, A0, T0 + add.d B0, B, T0 + #endif + +@@ -1656,7 +1622,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + sub.d L, K, OFF + #elif defined(LEFT) + /* number of values in A */ +- addi.d L, OFF, 1 ++ addi.d L, OFF, 4 + #else + /* number of values in B */ + addi.d L, OFF, 4 +@@ -1666,55 +1632,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ move L, K /* L = bk */ + #endif + +- /* Load 1 * 64 from A0 */ ++ /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + +- xvldrepl.d U4, B0, 0x08 ++ xvldrepl.d U5, B0, 0x08 + /* line 2 */ +- xvfmul.d D4, U0, U4 ++ xvfmul.d D4, U0, U5 + +- xvldrepl.d U4, B0, 0x10 ++ xvldrepl.d U6, B0, 0x10 + /* line 3 */ +- xvfmul.d D8, U0, U4 ++ xvfmul.d D8, U0, U6 + +- xvldrepl.d U4, B0, 0x18 ++ xvldrepl.d U7, B0, 0x18 + /* line 4 */ +- xvfmul.d D12, U0, U4 ++ xvfmul.d D12, U0, U7 + + /* Add stride for A0 and B0 */ +- addi.d A0, A0, 0x08 ++ addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ +- /* if (TL < 1) goto L_M1_L7 */ +- beq ZERO,TL, .L_M1_L7 ++ /* if (TL < 1) goto L_M4_L7 */ ++ beq ZERO,TL, .L_M4_L7 + +-.L_M1_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 1 * 64 from A0 */ +- xvld U0, A0, 0x00 ++ xvld U8, A0, 0x00 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ addi.d TL, TL, -1 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ xvldrepl.d U14, B0, 0x10 ++ xvldrepl.d U15, B0, 0x18 ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ beq ZERO, TL, .L_M4_TL1_END + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++.L_M4_TL1: /* TL-- */ ++ KERNEL8x4x4 + +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x20 ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_M4_TL1 ++ ++.L_M4_TL1_END: ++ KERNEL8x4x4_END ++ ++.L_M4_L7: ++ /* if (!(L & 7)) goto L_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_M4_L0 + +- /***8-2***/ ++.L_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 +@@ -1729,119 +1702,287 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + +- addi.d A0, A0, 0x08 ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + +- /***8-3***/ +- xvld U0, A0, 0x00 ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_M4_L71 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++.L_M4_L0: ++ xvldrepl.d VALPHA, $sp, 112 ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++ xvfmul.d D4, D4, VALPHA ++ xvfmul.d D8, D8, VALPHA ++ xvfmul.d D12, D12, VALPHA ++#else ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ xvfmadd.d D4, D4, VALPHA, U1 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ xvfmadd.d D8, D8, VALPHA, U2 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++ xvfmadd.d D12, D12, VALPHA, U3 ++#endif // #if defined(TRMMKERNEL) + +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x20 ++ /* Store C0 */ ++ xvst D0, C0, 0x00 ++ /* Store C1 */ ++ xvst D4, C1, 0x00 ++ /* Store C2 */ ++ xvst D8, C2, 0x00 ++ /* Store C3 */ ++ xvst D12, C3, 0x00 + +- /***8-4***/ +- xvld U0, A0, 0x00 ++ /* Add stride for C */ ++ addi.d C0, C0, 0x20 ++ addi.d C1, C1, 0x20 ++ addi.d C2, C2, 0x20 ++ addi.d C3, C3, 0x20 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ addi.d L, L, -4 ++#else ++ /* number of values in B */ ++ addi.d L, L, -4 ++#endif ++ slli.d T0, L, 0x05 ++ add.d A0, A0, T0 ++ add.d B0, B0, T0 ++#endif + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++#ifdef LEFT ++ /* number of values in A */ ++ addi.d OFF, OFF, 0x04 ++#endif ++#endif // #if defined(TRMMKERNEL) + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++.L_M2: ++ andi I, M, 2 ++ beq ZERO,I, .L_M1 + +- addi.d A0, A0, 0x08 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x04 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x05 ++ add.d B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 2 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ ++ /* Load 2 * 64 from A0 */ ++ xvldrepl.d U0, A0, 0x00 ++ xvldrepl.d U1, A0, 0x08 ++ ++ xvld U4, B0, 0x00 ++ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M2_L7 */ ++ beq ZERO,TL, .L_M2_L7 ++ ++ xvldrepl.d U8, A0, 0x00 ++ xvldrepl.d U9, A0, 0x08 ++ ++ addi.d TL, TL, -1 ++ ++ xvld U12, B0, 0x00 ++ addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + +- /***8-5***/ +- xvld U0, A0, 0x00 ++ beq ZERO, TL, .L_M2_TL1_END ++.L_M2_TL1: /* TL-- */ ++ KERNEL8x2x4 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_M2_TL1 ++.L_M2_TL1_END: ++ KERNEL8x2x4_END + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d 
D4, U0, U4, D4 ++.L_M2_L7: ++ /* if (!(L & 7)) goto L_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_M2_L0 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++.L_M2_L71: ++ xvldrepl.d U0, A0, 0x00 ++ xvldrepl.d U1, A0, 0x08 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ xvld U4, B0, 0x00 + +- addi.d A0, A0, 0x08 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + +- /***8-6***/ +- xvld U0, A0, 0x00 ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_M2_L71 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++.L_M2_L0: ++ xvldrepl.d VALPHA, $sp, 112 ++#if defined(TRMMKERNEL) ++ xvfmul.d D0, D0, VALPHA ++ xvfmul.d D1, D1, VALPHA + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvstelm.d D0, C0, 0x00, 0x00 ++ xvstelm.d D0, C1, 0x00, 0x01 ++ xvstelm.d D0, C2, 0x00, 0x02 ++ xvstelm.d D0, C3, 0x00, 0x03 ++ xvstelm.d D1, C0, 0x08, 0x00 ++ xvstelm.d D1, C1, 0x08, 0x01 ++ xvstelm.d D1, C2, 0x08, 0x02 ++ xvstelm.d D1, C3, 0x08, 0x03 ++#else ++ xvpackev.d D4, D1, D0 ++ xvpackod.d D5, D1, D0 ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ xvpermi.q U2, U0, 0x20 ++ xvpermi.q U3, U1, 0x20 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ xvfmadd.d D0, D4, VALPHA, U2 ++ xvfmadd.d D1, D5, VALPHA, U3 + +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x20 ++ vst $vr16, C0, 0x00 ++ vst $vr17, C1, 0x00 ++ xvstelm.d D0, C2, 0x00, 0x02 ++ xvstelm.d D1, C3, 0x00, 0x02 ++ xvstelm.d D0, C2, 0x08, 0x03 ++ xvstelm.d D1, C3, 0x08, 0x03 ++#endif // #if defined(TRMMKERNEL) + +- /***8-7***/ +- xvld U0, A0, 0x00 ++ /* Add stride for C */ ++ addi.d C0, C0, 0x10 ++ addi.d C1, C1, 0x10 ++ addi.d C2, C2, 0x10 ++ addi.d C3, C3, 0x10 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ sub.d L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ addi.d L, L, -2 ++#else ++ /* number of values in B */ ++ addi.d L, L, -4 ++#endif ++ slli.d T0, L, 0x04 ++ add.d A0, A0, T0 ++ slli.d T0, L, 0x05 ++ add.d B0, B0, T0 ++#endif + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++#ifdef LEFT ++ /* number of values in A */ ++ addi.d OFF, OFF, 0x02 ++#endif ++#endif // #if defined(TRMMKERNEL) + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++.L_M1: ++ andi I, M, 1 ++ beq ZERO,I, .L_M0 + +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x20 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x03 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x05 ++ add.d B0, B, T0 ++#endif + +- /***8-8***/ +- xvld U0, A0, 0x00 ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 1 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ xvldrepl.d U0, A0, 0x00 ++ xvld U4, B0, 0x00 ++ xvfmul.d D0, U0, U4 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ /* Add stride for A0 and B0 
*/ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M1_L7 */ ++ beq ZERO,TL, .L_M1_L7 + +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 ++ xvldrepl.d U8, A0, 0x00 + +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ addi.d TL, TL, -1 ++ xvld U12, B0, 0x00 ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 + +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x20 ++ beq ZERO, TL, .L_M1_TL1_END ++ ++.L_M1_TL1: /* TL-- */ ++ KERNEL8x1x4 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 ++.L_M1_TL1_END: ++ KERNEL8x1x4_END + + .L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ +@@ -1849,19 +1990,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + beq TL, ZERO,.L_M1_L0 + + .L_M1_L71: +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- xvldrepl.d U4, B0, 0x10 +- xvfmadd.d D8, U0, U4, D8 +- +- xvldrepl.d U4, B0, 0x18 +- xvfmadd.d D12, U0, U4, D12 ++ xvldrepl.d U0, A0, 0x00 ++ xvld U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 +@@ -1871,33 +2002,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + blt ZERO,TL, .L_M1_L71 + + .L_M1_L0: ++ xvldrepl.d VALPHA, $sp, 112 + #if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +- xvfmul.d D4, D4, VALPHA +- xvfmul.d D8, D8, VALPHA +- xvfmul.d D12, D12, VALPHA ++ ++ xvstelm.d D0, C0, 0x00, 0x00 ++ xvstelm.d D0, C1, 0x00, 0x01 ++ xvstelm.d D0, C2, 0x00, 0x02 ++ xvstelm.d D0, C3, 0x00, 0x03 + #else + /* Load C0 */ +- xvld U0, C0, 0x00 +- xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ ++ xvldrepl.d U0, C0, 0x00 ++ xvfmadd.d D4, D0, VALPHA, U0 + + /* Load C1 */ +- xvld U0, C1, 0x00 +- xvfmadd.d D4, D4, VALPHA, U0 ++ xvldrepl.d U1, C1, 0x00 ++ xvfmadd.d D5, D0, VALPHA, U1 + + /* Load C2 */ +- xvld U0, C2, 0x00 +- xvfmadd.d D8, D8, VALPHA, U0 ++ xvldrepl.d U2, C2, 0x00 ++ xvfmadd.d D6, D0, VALPHA, U2 + + /* Load C3 */ +- xvld U0, C3, 0x00 +- xvfmadd.d D12, D12, VALPHA, U0 +-#endif // #if defined(TRMMKERNEL) ++ xvldrepl.d U3, C3, 0x00 ++ xvfmadd.d D7, D0, VALPHA, U3 + +- xvstelm.d D0, C0, 0x00, 0x00 +- xvstelm.d D4, C1, 0x00, 0x00 +- xvstelm.d D8, C2, 0x00, 0x00 +- xvstelm.d D12, C3, 0x00, 0x00 ++ xvstelm.d D4, C0, 0x00, 0x00 ++ xvstelm.d D5, C1, 0x00, 0x01 ++ xvstelm.d D6, C2, 0x00, 0x02 ++ xvstelm.d D7, C3, 0x00, 0x03 ++#endif // #if defined(TRMMKERNEL) + + /* Add stride for C */ + addi.d C0, C0, 0x08 +@@ -1952,6 +2086,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ///////////////////////////////////////////////// + /************************ Condition 1 if((N >> 2) && (M >> 4)) END !!! ************************/ + ++ xvldrepl.d VALPHA, $sp, 112 ++ + .L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 +@@ -1993,223 +2129,65 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ addi.d L, OFF, 2 + #endif + #else // #if !defined(TRMMKERNEL) +- move B0, B +- move L, K /* L = bk */ +-#endif +- +- /* Load 16 * 64 from A0 +- * U0 = {a3, a2, a1, a0} +- * U1 = {a7, a6, a5, a4} +- * U2 = {a11, a10, a9, a8} +- * U3 = {a15, a14, a13, a12} +- */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- /* line 1 */ +- xvfmul.d D0, U0, U4 +- xvfmul.d D1, U1, U4 +- xvfmul.d D2, U2, U4 +- xvfmul.d D3, U3, U4 +- +- xvldrepl.d U4, B0, 0x08 +- /* line 2 */ +- xvfmul.d D4, U0, U4 +- xvfmul.d D5, U1, U4 +- xvfmul.d D6, U2, U4 +- xvfmul.d D7, U3, U4 +- +- /* Add stride for A0 and B0 */ +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x10 +- /* Reduce L */ +- addi.d L, L, -1 +- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ +- /* if (TL < 1) goto L_N3_L7 */ +- beq ZERO,TL, .L_N3_L7 +- +-.L_N3_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x10 +- +- /***8-2***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x10 +- +- /***8-3***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x10 +- +- /***8-4***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x10 +- +- /***8-5***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x10 +- +- /***8-6***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- 
xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x10 ++ move B0, B ++ move L, K /* L = bk */ ++#endif + +- /***8-7***/ +- /* Load 16 * 64 from A0 */ ++ /* Load 16 * 64 from A0 ++ * U0 = {a3, a2, a1, a0} ++ * U1 = {a7, a6, a5, a4} ++ * U2 = {a11, a10, a9, a8} ++ * U3 = {a15, a14, a13, a12} ++ */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ xvfmul.d D2, U2, U4 ++ xvfmul.d D3, U3, U4 + +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x10 ++ xvldrepl.d U5, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U5 ++ xvfmul.d D5, U1, U5 ++ xvfmul.d D6, U2, U5 ++ xvfmul.d D7, U3, U5 + +- /***8-8***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N3_L7 */ ++ beq ZERO,TL, .L_N3_L7 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ xvld U10, A0, 0x40 ++ xvld U11, A0, 0x60 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + ++ beq ZERO, TL, .L_N3_TL1_END ++ ++.L_N3_TL1: /* TL-- */ ++ KERNEL8x16x2 ++ + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_TL1 ++.L_N3_TL1_END: ++ KERNEL8x16x2_END + + .L_N3_L7: + /* if (!(L & 7)) goto L_N3_L0 */ +@@ -2229,12 +2207,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- xvfmadd.d D6, U2, U4, D6 +- xvfmadd.d D7, U3, U4, D7 +- ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 +@@ -2264,14 +2241,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ +- xvld U0, C1, 0x00 +- xvld U1, C1, 0x20 +- xvld U2, C1, 0x40 +- xvld U3, C1, 0x60 +- xvfmadd.d D4, D4, VALPHA, U0 +- xvfmadd.d D5, D5, VALPHA, U1 +- xvfmadd.d D6, D6, VALPHA, U2 +- xvfmadd.d D7, D7, VALPHA, U3 ++ xvld U4, C1, 0x00 ++ xvld U5, C1, 0x20 ++ xvld U6, C1, 0x40 ++ xvld U7, C1, 0x60 ++ xvfmadd.d D4, D4, VALPHA, U4 ++ xvfmadd.d D5, D5, VALPHA, U5 ++ xvfmadd.d D6, D6, VALPHA, U6 ++ xvfmadd.d D7, D7, VALPHA, U7 + #endif // #if defined(TRMMKERNEL) + + /* Store C0 */ +@@ -2352,10 +2329,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + +- xvldrepl.d U4, B0, 0x08 ++ xvldrepl.d U5, B0, 0x08 + /* line 2 */ +- xvfmul.d D4, U0, U4 +- xvfmul.d D5, U1, U4 ++ xvfmul.d D4, U0, U5 ++ xvfmul.d D5, U1, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 +@@ -2366,131 +2343,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + /* if (TL < 1) goto L_N3_M8_L7 */ + beq ZERO,TL, .L_N3_M8_L7 + +-.L_N3_M8_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x10 +- +- /***8-2***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x10 +- +- /***8-3***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x10 +- +- /***8-4***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x10 +- +- /***8-5***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- /* Cumulative D0~D15 */ +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x10 +- +- /***8-6***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x10 +- +- /***8-7***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + +- /***8-8***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ beq ZERO, TL, .L_N3_M8_TL1_END + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x10 ++.L_N3_M8_TL1: /* TL-- */ ++ KERNEL8x8x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M8_TL1 ++.L_N3_M8_TL1_END: ++ KERNEL8x8x2_END + + .L_N3_M8_L7: + /* if (!(L & 7)) goto L_N3_M8_L0 */ +@@ -2505,9 +2376,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- xvfmadd.d D5, U1, U4, D5 ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 +@@ -2530,10 +2401,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ +- xvld U0, C1, 0x00 +- xvld U1, C1, 0x20 +- xvfmadd.d D4, D4, VALPHA, U0 +- xvfmadd.d D5, D5, VALPHA, U1 ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++ xvfmadd.d D4, D4, VALPHA, U2 ++ xvfmadd.d D5, D5, VALPHA, U3 + #endif // #if defined(TRMMKERNEL) + + /* Store C0 */ +@@ -2561,162 +2432,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + add.d B0, B0, T0 + #endif + +-#ifdef LEFT +- addi.d OFF, OFF, 0x08 +-#endif +-#endif // #if defined(TRMMKERNEL) +- +-/********LOOP (if(N & 2) && (M & 8) ) End************/ +- +-.L_N3_M4: +- andi I, M, 4 +- beq ZERO,I, .L_N3_M2 +- +-#if defined(TRMMKERNEL) +-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +- move B0, B +-#else +- slli.d T0, OFF, 0x05 +- add.d A0, A0, T0 +- slli.d T0, OFF, 0x04 +- add.d B0, B, T0 +-#endif +- +-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +- sub.d L, K, OFF +-#elif defined(LEFT) +- /* number of values in A */ +- addi.d L, OFF, 4 +-#else +- /* number of values in B */ +- addi.d L, OFF, 2 +-#endif +-#else // #if !defined(TRMMKERNEL) +- move B0, B +- move L, K /* L = bk */ +-#endif +- +- /* Load 4 * 64 from A0 */ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- /* line 1 */ +- xvfmul.d D0, U0, U4 +- +- xvldrepl.d U4, B0, 0x08 +- /* line 2 */ +- xvfmul.d D4, U0, U4 +- +- /* Add stride for A0 and B0 */ +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x10 +- /* Reduce L */ +- addi.d L, L, -1 +- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ +- /* if (TL < 1) goto L_N3_M4_L7 */ +- beq ZERO,TL, .L_N3_M4_L7 +- +-.L_N3_M4_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 8 * 64 from A0 */ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x10 +- +- /***8-2***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x10 +- +- /***8-3***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x10 +- +- /***8-4***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x10 ++#ifdef LEFT ++ addi.d OFF, OFF, 0x08 ++#endif ++#endif // #if defined(TRMMKERNEL) + +- /***8-5***/ +- xvld U0, A0, 0x00 ++/********LOOP (if(N & 2) && (M & 8) ) End************/ + +- /* Cumulative D0~D15 */ +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++.L_N3_M4: ++ andi I, M, 4 ++ beq ZERO,I, .L_N3_M2 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x05 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x04 ++ add.d B0, B, T0 ++#endif + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x10 ++#if (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 4 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 2 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif + +- /***8-6***/ ++ /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x10 ++ xvldrepl.d U5, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U5 + +- /***8-7***/ +- xvld U0, A0, 0x00 ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N3_M4_L7 */ ++ beq ZERO,TL, .L_N3_M4_L7 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ xvld U8, A0, 0x00 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + +- /***8-8***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ beq ZERO, TL, .L_N3_M4_TL1_END + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x10 ++.L_N3_M4_TL1: /* TL-- */ ++ KERNEL8x4x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M4_TL1 ++.L_N3_M4_TL1_END: ++ KERNEL8x4x2_END + + .L_N3_M4_L7: + /* if (!(L & 7)) goto L_N3_M4_L0 */ +@@ -2729,8 +2517,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 +@@ -2749,8 +2537,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ +- xvld U0, C1, 0x00 +- xvfmadd.d D4, D4, VALPHA, U0 ++ xvld U1, C1, 0x00 ++ xvfmadd.d D4, D4, VALPHA, U1 + #endif // #if defined(TRMMKERNEL) + + /* Store C0 */ +@@ -2830,106 +2618,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ /* if (TL < 1) goto L_N3_M2_L7 */ + beq ZERO,TL, .L_N3_M2_L7 + +-.L_N3_M2_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 2 * 64 from A0 */ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x10 +- +- /***8-2***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x10 +- +- /***8-3***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x10 +- +- /***8-4***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x10 +- +- /***8-5***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x10 +- +- /***8-6***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x10 ++ xvld U8, A0, 0x00 + +- /***8-7***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + +- /***8-8***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ beq ZERO, TL, .L_N3_M2_TL1_END + +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x10 ++.L_N3_M2_TL1: /* TL-- */ ++ KERNEL8x2x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M2_TL1 ++.L_N3_M2_TL1_END: ++ KERNEL8x2x2_END + + .L_N3_M2_L7: + /* if (!(L & 7)) goto L_N3_M2_L0 */ +@@ -2942,8 +2648,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 +@@ -2962,8 +2668,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ +- xvld U0, C1, 0x00 +- xvfmadd.d D4, D4, VALPHA, U0 ++ xvld U1, C1, 0x00 ++ xvfmadd.d D4, D4, VALPHA, U1 + #endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 +@@ -3017,132 +2723,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ #else + /* number of values in B */ + addi.d L, OFF, 2 +-#endif +-#else // #if !defined(TRMMKERNEL) +- move B0, B +- move L, K /* L = bk */ +-#endif +- +- /* Load 1 * 64 from A0 */ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- /* line 1 */ +- xvfmul.d D0, U0, U4 +- +- xvldrepl.d U4, B0, 0x08 +- /* line 2 */ +- xvfmul.d D4, U0, U4 +- +- /* Add stride for A0 and B0 */ +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x10 +- /* Reduce L */ +- addi.d L, L, -1 +- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ +- /* if (TL < 1) goto L_N3_M1_L7 */ +- beq ZERO,TL, .L_N3_M1_L7 +- +-.L_N3_M1_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 1 * 64 from A0 */ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x10 +- +- /***8-2***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x10 +- +- /***8-3***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x10 +- +- /***8-4***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x10 +- +- /***8-5***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x10 +- +- /***8-6***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x10 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif + +- /***8-7***/ ++ /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 + +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x10 ++ xvldrepl.d U4, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U4 + +- /***8-8***/ +- xvld U0, A0, 0x00 ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N3_M1_L7 */ ++ beq ZERO,TL, .L_N3_M1_L7 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ xvld U8, A0, 0x00 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + ++ beq ZERO, TL, .L_N3_M1_TL1_END ++ ++.L_N3_M1_TL1: /* TL-- */ ++ KERNEL8x1x2 ++ + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M1_TL1 ++.L_N3_M1_TL1_END: ++ KERNEL8x1x2_END + + .L_N3_M1_L7: + /* if (!(L & 7)) goto L_N3_M1_L0 */ +@@ -3155,8 +2779,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + +- xvldrepl.d U4, B0, 0x08 +- xvfmadd.d D4, U0, U4, D4 ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 +@@ -3175,8 +2799,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ +- xvld U0, C1, 0x00 +- xvfmadd.d D4, D4, VALPHA, U0 ++ xvld U1, C1, 0x00 ++ xvfmadd.d D4, D4, VALPHA, U1 + #endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 +@@ -3300,137 +2924,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + /* if (TL < 1) goto L_N1_L7 */ + beq ZERO,TL, .L_N1_L7 + +-.L_N1_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x08 +- +- /***8-2***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x08 +- +- /***8-3***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x08 +- +- /***8-4***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x08 +- +- /***8-5***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x08 +- +- /***8-6***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x08 +- +- /***8-7***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ xvld U10, A0, 0x40 ++ xvld U11, A0, 0x60 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + +- /***8-8***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- xvld U2, A0, 0x40 +- xvld U3, A0, 0x60 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- xvfmadd.d D2, U2, U4, D2 +- xvfmadd.d D3, U3, U4, D3 +- +- addi.d A0, A0, 0x80 +- addi.d B0, B0, 0x08 ++ beq ZERO, TL, .L_N1_TL1_END ++.L_N1_TL1: /* TL-- */ ++ KERNEL8x16x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_TL1 ++.L_N1_TL1_END: ++ KERNEL8x16x1_END + + .L_N1_L7: + /* if (!(L & 7)) goto L_N1_L0 */ +@@ -3494,161 +3006,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ #endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 +- slli.d T0, L, 0x03 +- add.d B0, B0, T0 +-#endif +- +-#ifdef LEFT +- addi.d OFF, OFF, 0x10 +-#endif +-#endif // #if defined(TRMMKERNEL) +- +- addi.d I, I, -1 /* I-- */ +- blt ZERO,I, .L_N1_I1 +- +-.L_N1_M8: +- /* We have done M & 16, considering M=8/4/2/1 */ +- andi I, M, 15 +- beq ZERO,I, .L_N1_M0 +- +- andi I, M, 8 +- beq ZERO,I, .L_N1_M4 +- +-#if defined(TRMMKERNEL) +-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +- move B0, B +-#else +- slli.d T0, OFF, 0x06 +- add.d A0, A0, T0 +- slli.d T0, OFF, 0x03 +- add.d B0, B, T0 +-#endif +- +-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +- sub.d L, K, OFF +-#elif defined(LEFT) +- /* number of values in A */ +- addi.d L, OFF, 8 +-#else +- /* number of values in B */ +- addi.d L, OFF, 1 +-#endif +-#else // #if !defined(TRMMKERNEL) +- move B0, B +- move L, K /* L = bk */ +-#endif +- +- /* Load 8 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- /* line 1 */ +- xvfmul.d D0, U0, U4 +- xvfmul.d D1, U1, U4 +- +- /* Add stride for A0 and B0 */ +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x08 +- /* Reduce L */ +- addi.d L, L, -1 +- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ +- /* if (TL < 1) goto L_N1_M8_L7 */ +- beq ZERO,TL, .L_N1_M8_L7 +- +-.L_N1_M8_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 16 * 64 from A0 */ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x08 +- +- /***8-2***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x08 +- +- /***8-3***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x08 +- +- /***8-4***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 +- +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x08 ++ slli.d T0, L, 0x03 ++ add.d B0, B0, T0 ++#endif + +- /***8-5***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 ++#ifdef LEFT ++ addi.d OFF, OFF, 0x10 ++#endif ++#endif // #if defined(TRMMKERNEL) + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 ++ addi.d I, I, -1 /* I-- */ ++ blt ZERO,I, .L_N1_I1 + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x08 ++.L_N1_M8: ++ /* We have done M & 16, considering M=8/4/2/1 */ ++ andi I, M, 15 ++ beq ZERO,I, .L_N1_M0 + +- /***8-6***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 ++ andi I, M, 8 ++ beq ZERO,I, .L_N1_M4 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ slli.d T0, OFF, 0x06 ++ add.d A0, A0, T0 ++ slli.d T0, OFF, 0x03 ++ add.d B0, B, T0 ++#endif + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x08 ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ sub.d L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ addi.d L, OFF, 8 ++#else ++ /* number of values in B */ ++ addi.d L, OFF, 1 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif + +- /***8-7***/ ++ /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + +- xvldrepl.d U4, 
B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 + +- addi.d A0, A0, 0x40 +- addi.d B0, B0, 0x08 ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_M8_L7 */ ++ beq ZERO,TL, .L_N1_M8_L7 + +- /***8-8***/ +- xvld U0, A0, 0x00 +- xvld U1, A0, 0x20 ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- xvfmadd.d D1, U1, U4, D1 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + ++ beq ZERO, TL, .L_N1_M8_TL1_END ++.L_N1_M8_TL1: /* TL-- */ ++ KERNEL8x8x1 ++ + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M8_TL1 + ++.L_N1_M8_TL1_END: ++ KERNEL8x8x1_END ++ + .L_N1_M8_L7: + /* if (!(L & 7)) goto L_N1_M8_L0 */ + andi TL, L, 7 +@@ -3753,81 +3191,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 + +-.L_N1_M4_TL1: /* TL-- */ +- /***8-1***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x08 +- +- /***8-2***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x08 +- +- /***8-3***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x08 +- +- /***8-4***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x08 +- +- /***8-5***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x08 +- +- /***8-6***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x08 +- +- /***8-7***/ +- xvld U0, A0, 0x00 ++ xvld U8, A0, 0x00 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + +- /***8-8***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ beq ZERO, TL, .L_N1_M4_TL1_END + +- addi.d A0, A0, 0x20 +- addi.d B0, B0, 0x08 ++.L_N1_M4_TL1: /* TL-- */ ++ KERNEL8x4x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 ++.L_N1_M4_TL1_END: ++ KERNEL8x4x1_END + + .L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ +@@ -3927,82 +3307,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 + +-.L_N1_M2_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 2 * 64 from A0 */ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x08 +- +- /***8-2***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x08 +- +- /***8-3***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x08 +- +- /***8-4***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x08 +- +- /***8-5***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x08 +- +- /***8-6***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x08 +- +- /***8-7***/ +- xvld U0, A0, 0x00 ++ xvld U8, A0, 0x00 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + +- /***8-8***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ beq ZERO, TL, .L_N1_M2_TL1_END + +- addi.d A0, A0, 0x10 +- addi.d B0, B0, 0x08 ++.L_N1_M2_TL1: /* TL-- */ ++ KERNEL8x2x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 ++.L_N1_M2_TL1_END: ++ KERNEL8x2x1_END + + .L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ +@@ -4101,82 +3422,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + +-.L_N1_M1_TL1: /* TL-- */ +- /***8-1***/ +- /* Load 1 * 64 from A0 */ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x08 +- +- /***8-2***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x08 +- +- /***8-3***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x08 +- +- /***8-4***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x08 +- +- /***8-5***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x08 +- +- /***8-6***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 +- +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x08 +- +- /***8-7***/ +- xvld U0, A0, 0x00 ++ xvld U8, A0, 0x00 + +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ addi.d TL, TL, -1 + ++ xvldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + +- /***8-8***/ +- xvld U0, A0, 0x00 +- +- xvldrepl.d U4, B0, 0x00 +- xvfmadd.d D0, U0, U4, D0 ++ beq ZERO, TL, .L_N1_M1_TL1_END + +- addi.d A0, A0, 0x08 +- addi.d B0, B0, 0x08 ++.L_N1_M1_TL1: /* TL-- */ ++ KERNEL8x1x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 ++.L_N1_M1_TL1_END: ++ KERNEL8x1x1_END + + .L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ +@@ -4243,7 +3505,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LD $f23, $sp, 40 +- addi.d $sp, $sp, 56 ++ LD $f24, $sp, 48 ++ LD $f25, $sp, 56 ++ LD $f26, $sp, 64 ++ LD $f27, $sp, 72 ++ LD $f28, $sp, 80 ++ LD $f29, $sp, 88 ++ LD $f30, $sp, 96 ++ LD $f31, $sp, 104 ++ addi.d $sp, $sp, 120 + + jirl $r0, $r1, 0x0 + +diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S +new file mode 100644 +index 000000000..a49bf9bb1 +--- /dev/null ++++ b/kernel/loongarch64/dgemv_n_8_lasx.S +@@ -0,0 +1,554 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/07/14 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++*********************************************************************/ ++ ++/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, ++ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) ++ */ ++#define M $r4 ++#define N $r5 ++#define ALPHA $f0 ++#define A $r7 ++#define LDA $r8 ++#define X $r9 ++#define INC_X $r10 ++#define Y $r11 ++#define INC_Y $r6 ++ ++#define J $r12 ++#define I $r13 ++#define K $r14 ++#define Y_ORG $r15 ++#define OFFSET $r16 ++#define K_LDA $r17 ++#define M8 $r18 ++#define T0 $r19 ++#define PA0 $r20 ++#define PA1 $r23 ++#define PA2 $r24 ++#define PA3 $r25 ++#define PA4 $r26 ++#define PA5 $r27 ++#define PA6 $r28 ++#define PA7 $r29 ++ ++#define VALPHA $xr1 ++#define X0 $xr2 ++#define X1 $xr3 ++#define X2 $xr4 ++#define X3 $xr5 ++#define X4 $xr6 ++#define X5 $xr7 ++#define X6 $xr8 ++#define X7 $xr9 ++#define Y0 $xr10 ++#define Y1 $xr11 ++#define A0 $xr12 ++#define A1 $xr13 ++#define A2 $xr14 ++#define A3 $xr15 ++#define A4 $xr16 ++#define A5 $xr17 ++#define A6 $xr18 ++#define A7 $xr19 ++#define A8 $xr20 ++#define A9 $xr21 ++#define A10 $xr22 ++#define A11 $xr23 ++#define A12 $xr24 ++#define A13 $xr25 ++#define A14 $xr26 ++#define A15 $xr27 ++ ++.macro DLOAD_X_8 ++ GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ ++ X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 ++ GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ ++ X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA ++.endm ++ ++.macro DLOAD_X_4 ++ GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18 ++ GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA ++.endm ++ ++.macro DLOAD_X_2 ++ GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08 ++ GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA ++.endm ++ ++.macro DLOAD_X_1 ++ GLDREPL xv, d, X0, X, 0x00 ++ GMUL xvf, d, X0, X0, VALPHA ++.endm ++ ++.macro DLOAD_Y_8 ++ GLD xv, , Y0, Y, 0, Y1, Y, 0x20 ++.endm ++ ++.macro DLOAD_Y_4 ++ GLD xv, , Y0, Y, 0 ++.endm ++ ++.macro DLOAD_Y_1 ++ fld.d $f10, Y, 0 ++.endm ++ ++.macro DSTORE_Y_8 ++ GST xv, , Y0, Y, 0, Y1, Y, 0x20 ++.endm ++ ++.macro DSTORE_Y_4 ++ GST xv, , Y0, Y, 0 ++.endm ++ ++.macro DSTORE_Y_1 ++ fst.d $f10, Y, 0 ++.endm ++ ++// Unable to use vector load/store ins ++.macro DLOAD_Y_8_GAP ++ fld.d $f10, Y, 0 ++ fldx.d $f13, Y, INC_Y ++ PTR_ALSL T0, INC_Y, Y, 1 ++ fld.d $f14, T0, 0 ++ fldx.d $f15, T0, INC_Y ++ PTR_ALSL T0, INC_Y, Y, 2 ++ fld.d $f11, T0, 0 ++ fldx.d $f17, T0, INC_Y ++ PTR_ADD T0, T0, INC_Y ++ PTR_ADD T0, T0, INC_Y ++ fld.d $f18, T0, 0 ++ fldx.d $f19, T0, INC_Y ++ GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3 ++.endm ++ ++.macro DLOAD_Y_4_GAP ++ fld.d $f10, Y, 0 ++ fldx.d $f13, Y, INC_Y ++ PTR_ALSL T0, INC_Y, Y, 1 ++ fld.d $f14, T0, 0 ++ fldx.d $f15, T0, INC_Y ++ GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3 ++.endm ++ ++.macro DSTORE_Y_8_GAP ++ xvstelm.d Y0, Y, 0, 0 ++ PTR_ADD T0, Y, INC_Y ++ xvstelm.d Y0, T0, 0, 1 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.d Y0, T0, 0, 2 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.d Y0, T0, 0, 3 ++ ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.d Y1, T0, 0, 0 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.d Y1, T0, 0, 1 ++ PTR_ADD T0, T0, 
INC_Y ++ xvstelm.d Y1, T0, 0, 2 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.d Y1, T0, 0, 3 ++.endm ++ ++.macro DSTORE_Y_4_GAP ++ xvstelm.d Y0, Y, 0, 0 ++ PTR_ADD T0, Y, INC_Y ++ xvstelm.d Y0, T0, 0, 1 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.d Y0, T0, 0, 2 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.d Y0, T0, 0, 3 ++.endm ++ ++.macro DLOAD_X_8_GAP ++ xvldrepl.d X0, X, 0x00 ++ PTR_ADD T0, X, INC_X ++ xvldrepl.d X1, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.d X2, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.d X3, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.d X4, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.d X5, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.d X6, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.d X7, T0, 0x00 ++ GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ ++ X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA ++.endm ++ ++.macro DLOAD_X_4_GAP ++ xvldrepl.d X0, X, 0x00 ++ PTR_ADD T0, X, INC_X ++ xvldrepl.d X1, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.d X2, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.d X3, T0, 0x00 ++ GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA ++.endm ++ ++.macro DLOAD_X_2_GAP ++ xvldrepl.d X0, X, 0x00 ++ PTR_ADD T0, X, INC_X ++ xvldrepl.d X1, T0, 0x00 ++ GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA ++.endm ++ ++.macro DGEMV_N_8x8 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA0, 0, \ ++ A2, PA1, 0, A3, PA1, 0, \ ++ A4, PA2, 0, A5, PA2, 0, \ ++ A6, PA3, 0, A7, PA3, 0, \ ++ A8, PA4, 0, A9, PA4, 0, \ ++ A10, PA5, 0, A11, PA5, 0, \ ++ A12, PA6, 0, A13, PA6, 0, \ ++ A14, PA7, 0, A15, PA7, 0 ++ ++ GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ ++ Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ ++ Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ ++ Y0, A6, X3, Y0, Y1, A7, X3, Y1, \ ++ Y0, A8, X4, Y0, Y1, A9, X4, Y1, \ ++ Y0, A10, X5, Y0, Y1, A11, X5, Y1, \ ++ Y0, A12, X6, Y0, Y1, A13, X6, Y1, \ ++ Y0, A14, X7, Y0, Y1, A15, X7, Y1 ++.endm ++ ++.macro DGEMV_N_4x8 ++ GLD_INC xv, , 0x20, A0, PA0, 0, \ ++ A2, PA1, 0, \ ++ A4, PA2, 0, \ ++ A6, PA3, 0, \ ++ A8, PA4, 0, \ ++ A10, PA5, 0, \ ++ A12, PA6, 0, \ ++ A14, PA7, 0 ++ ++ GMADD xvf, d, Y0, A0, X0, Y0, \ ++ Y0, A2, X1, Y0, \ ++ Y0, A4, X2, Y0, \ ++ Y0, A6, X3, Y0, \ ++ Y0, A8, X4, Y0, \ ++ Y0, A10, X5, Y0, \ ++ Y0, A12, X6, Y0, \ ++ Y0, A14, X7, Y0 ++.endm ++ ++.macro DGEMV_N_1x8 ++ GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \ ++ $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0 ++ GMADD f, d, $f10, $f12, $f2, $f10, \ ++ $f10, $f14, $f3, $f10, \ ++ $f10, $f16, $f4, $f10, \ ++ $f10, $f18, $f5, $f10, \ ++ $f10, $f20, $f6, $f10, \ ++ $f10, $f22, $f7, $f10, \ ++ $f10, $f24, $f8, $f10, \ ++ $f10, $f26, $f9, $f10, ++.endm ++ ++.macro DGEMV_N_8x4 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA0, 0, \ ++ A2, PA1, 0, A3, PA1, 0, \ ++ A4, PA2, 0, A5, PA2, 0, \ ++ A6, PA3, 0, A7, PA3, 0 ++ ++ GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ ++ Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ ++ Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ ++ Y0, A6, X3, Y0, Y1, A7, X3, Y1 ++.endm ++ ++.macro DGEMV_N_4x4 ++ GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 ++ ++ GMADD xvf, d, Y0, A0, X0, Y0, Y0, A2, X1, Y0, \ ++ Y0, A4, X2, Y0, Y0, A6, X3, Y0 ++.endm ++ ++.macro DGEMV_N_1x4 ++ GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0 ++ GMADD f, d, $f10, $f12, $f2, $f10, $f10, $f14, $f3, $f10, \ ++ $f10, $f16, $f4, $f10, $f10, $f18, $f5, $f10 ++.endm ++ ++.macro DGEMV_N_8x2 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA0, 0, \ ++ A2, PA1, 0, A3, 
PA1, 0 ++ GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ ++ Y0, A2, X1, Y0, Y1, A3, X1, Y1 ++.endm ++ ++.macro DGEMV_N_4x2 ++ GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 ++ GMADD xvf, d, Y0, A0, X0, Y0, \ ++ Y0, A2, X1, Y0 ++.endm ++ ++.macro DGEMV_N_1x2 ++ GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0 ++ GMADD f, d, $f10, $f12, $f2, $f10, \ ++ $f10, $f14, $f3, $f10 ++.endm ++ ++.macro DGEMV_N_1x1 ++ fld.d $f12, PA0, 0 ++ PTR_ADDI PA0, PA0, 0x08 ++ fmadd.d $f10, $f12, $f2, $f10 ++.endm ++ ++.macro DGEMV_N_LASX XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req ++ PTR_SRLI J, N, 3 ++ beqz J, .L_\XW\()_N_7 ++ PTR_SLLI K_LDA, LDA, 3 ++ PTR_SUB K_LDA, K_LDA, M8 ++.L_\XW\()_N_L8: ++ DLOAD_\X_8 ++ xor K, K, K ++ move Y, Y_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_M_7 ++.align 5 ++.L_\XW\()_M_L8: ++ DLOAD_\Y_8 ++ DGEMV_N_8x8 ++ DSTORE_\Y_8 ++ PTR_ADDI I, I, -1 ++ PTR_ALSL Y, INC_Y, Y, 3 ++ PTR_ADDI K, K, 8 ++ bnez I, .L_\XW\()_M_L8 ++.L_\XW\()_M_7: ++ andi I, M, 4 ++ beqz I, .L_\XW\()_M_3 ++ DLOAD_\Y_4 ++ DGEMV_N_4x8 ++ DSTORE_\Y_4 ++ PTR_ALSL Y, INC_Y, Y, 2 ++ PTR_ADDI K, K, 4 ++.L_\XW\()_M_3: ++ andi I, M, 3 ++ beqz I, .L_\XW\()_M_END ++.align 5 ++.L_\XW\()_M_L1: ++ DLOAD_\Y_1 ++ DGEMV_N_1x8 ++ DSTORE_\Y_1 ++ PTR_ADDI I, I, -1 ++ PTR_ADD Y, Y, INC_Y ++ PTR_ADDI K, K, 1 ++ bnez I, .L_\XW\()_M_L1 ++.L_\XW\()_M_END: ++ PTR_ADDI J, J, -1 ++#if __loongarch_grlen == 64 ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#endif ++ PTR_ALSL X, INC_X, X, 3 ++ bnez J, .L_\XW\()_N_L8 ++.L_\XW\()_N_7: ++ andi J, N, 4 ++ beqz J, .L_\XW\()_N_3 ++ DLOAD_\X_4 ++ xor K, K, K ++ move Y, Y_ORG ++ ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_N_4_M_7 ++.align 5 ++.L_\XW\()_N_4_M_L8: ++ DLOAD_\Y_8 ++ DGEMV_N_8x4 ++ DSTORE_\Y_8 ++ PTR_ADDI I, I, -1 ++ PTR_ADDI K, K, 8 ++ PTR_ALSL Y, INC_Y, Y, 3 ++ bnez I, .L_\XW\()_N_4_M_L8 ++.L_\XW\()_N_4_M_7: ++ andi I, M, 4 ++ beqz I, .L_\XW\()_N_4_M_3 ++ DLOAD_\Y_4 ++ DGEMV_N_4x4 ++ DSTORE_\Y_4 ++ PTR_ALSL Y, INC_Y, Y, 2 ++ PTR_ADDI K, K, 4 ++.L_\XW\()_N_4_M_3: ++ andi I, M, 3 ++ beqz I, .L_\XW\()_N_4_M_END ++.align 5 ++.L_\XW\()_N_4_M_L1: ++ DLOAD_\Y_1 ++ DGEMV_N_1x4 ++ DSTORE_\Y_1 ++ PTR_ADDI I, I, -1 ++ PTR_ADD Y, Y, INC_Y ++ PTR_ADDI K, K, 1 ++ bnez I, .L_\XW\()_N_4_M_L1 ++.L_\XW\()_N_4_M_END: ++ PTR_SLLI K_LDA, LDA, 2 ++ PTR_SUB K_LDA, K_LDA, M8 ++#if __loongarch_grlen == 64 ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#endif ++ PTR_ALSL X, INC_X, X, 2 ++.L_\XW\()_N_3: ++ andi J, N, 2 ++ beqz J, .L_\XW\()_N_1 ++ DLOAD_\X_2 ++ xor K, K, K ++ move Y, Y_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_N_2_M_7 ++.align 5 ++.L_\XW\()_N_2_M_L8: ++ DLOAD_\Y_8 ++ DGEMV_N_8x2 ++ DSTORE_\Y_8 ++ PTR_ADDI I, I, -1 ++ PTR_ADDI K, K, 8 ++ PTR_ALSL Y, INC_Y, Y, 3 ++ bnez I, .L_\XW\()_N_2_M_L8 ++.L_\XW\()_N_2_M_7: ++ andi I, M, 4 ++ beqz I, .L_\XW\()_N_2_M_3 ++ DLOAD_\Y_4 ++ DGEMV_N_4x2 ++ DSTORE_\Y_4 
++ PTR_ALSL Y, INC_Y, Y, 2 ++ PTR_ADDI K, K, 4 ++.L_\XW\()_N_2_M_3: ++ andi I, M, 3 ++ beqz I, .L_\XW\()_N_2_M_END ++.align 5 ++.L_\XW\()_N_2_M_L1: ++ DLOAD_\Y_1 ++ DGEMV_N_1x2 ++ DSTORE_\Y_1 ++ PTR_ADDI I, I, -1 ++ PTR_ADD Y, Y, INC_Y ++ PTR_ADDI K, K, 1 ++ bnez I, .L_\XW\()_N_2_M_L1 ++.L_\XW\()_N_2_M_END: ++ PTR_SLLI K_LDA, LDA, 1 ++ PTR_SUB K_LDA, K_LDA, M8 ++ PTR_ADD PA0, PA0, K_LDA ++ PTR_ADD PA1, PA1, K_LDA ++ PTR_ALSL X, INC_X, X, 1 ++.L_\XW\()_N_1: ++ andi J, N, 1 ++ beqz J, .L_END ++ DLOAD_\X_1 ++ xor K, K, K ++ move Y, Y_ORG ++ move I, M ++ beqz I, .L_END ++.align 5 ++.L_\XW\()_N_1_M_L1: ++ DLOAD_\Y_1 ++ DGEMV_N_1x1 ++ DSTORE_\Y_1 ++ PTR_ADDI I, I, -1 ++ PTR_ADD Y, Y, INC_Y ++ PTR_ADDI K, K, 1 ++ bnez I, .L_\XW\()_N_1_M_L1 ++ b .L_END ++.endm ++ ++ PROLOGUE ++ PTR_LD INC_Y, $sp, 0 ++ push_if_used 17 + 7, 24 + 4 ++ PTR_ADDI K, $r0, 0x01 ++ PTR_SUB I, INC_X, K ++ PTR_SUB J, INC_Y, K ++ maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ ++ maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ ++ PTR_ALSL I, I, J, 1 ++ GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 ++ xvreplve0.d VALPHA, $xr0 ++ move Y_ORG, Y ++ move PA0, A ++#if __loongarch_grlen == 64 ++ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#else ++ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#endif ++ la.local T0, .L_GAP_TABLE ++ PTR_ALSL I, I, T0, 1 ++ ld.h K, I, 0 ++ PTR_ADD T0, T0, K ++ jirl $r0, T0, 0 ++.L_GAP_TABLE: ++ .hword .L_GAP_0_0 - .L_GAP_TABLE ++ .hword .L_GAP_0_1 - .L_GAP_TABLE ++ .hword .L_GAP_1_0 - .L_GAP_TABLE ++ .hword .L_GAP_1_1 - .L_GAP_TABLE ++.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ ++ DGEMV_N_LASX GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 ++.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ ++ DGEMV_N_LASX GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 ++.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ ++ DGEMV_N_LASX GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 ++.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ ++ DGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 ++.L_END: ++ pop_if_used 17 + 7, 24 + 4 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S +new file mode 100644 +index 000000000..71f942b0f +--- /dev/null ++++ b/kernel/loongarch64/dgemv_t_8_lasx.S +@@ -0,0 +1,481 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. 
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/07/17 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++*********************************************************************/ ++ ++/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, ++ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) ++ */ ++#define M $r4 ++#define N $r5 ++#define ALPHA $f0 ++#define A $r7 ++#define LDA $r8 ++#define X $r9 ++#define INC_X $r10 ++#define Y $r11 ++#define INC_Y $r6 ++ ++#define J $r12 ++#define I $r13 ++#define K $r14 ++#define PY0 $r14 ++#define X_ORG $r15 ++#define PY1 $r16 ++#define K_LDA $r17 ++#define PY2 $r18 ++#define T0 $r19 ++#define PA0 $r20 ++#define PA1 $r23 ++#define PA2 $r24 ++#define PA3 $r25 ++#define PA4 $r26 ++#define PA5 $r27 ++#define PA6 $r28 ++#define PA7 $r29 ++#define M8 $r30 ++ ++#define VALPHA $xr0 ++#define X0 $xr1 ++#define X1 $xr2 ++#define A0 $xr3 ++#define A1 $xr4 ++#define A2 $xr5 ++#define A3 $xr6 ++#define A4 $xr7 ++#define A5 $xr8 ++#define A6 $xr9 ++#define A7 $xr10 ++#define A8 $xr11 ++#define A9 $xr12 ++#define A10 $xr13 ++#define A11 $xr14 ++#define A12 $xr15 ++#define A13 $xr16 ++#define A14 $xr17 ++#define A15 $xr18 ++#define TP0 $xr19 ++#define TP1 $xr20 ++#define TP2 $xr21 ++#define TP3 $xr22 ++#define TP4 $xr23 ++#define TP5 $xr24 ++#define TP6 $xr25 ++#define TP7 $xr26 ++#define Y0 $xr3 ++#define Y1 $xr4 ++#define Y2 $xr5 ++#define Y3 $xr6 ++#define Y4 $xr7 ++#define Y5 $xr8 ++#define Y6 $xr9 ++#define Y7 $xr10 ++ ++.macro ZERO_Y8 ++ GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ ++ TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 ++.endm ++ ++.macro ZERO_Y4 ++ GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 ++.endm ++ ++.macro ZERO_Y2 ++ GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1 ++.endm ++ ++.macro ZERO_Y1 ++ GXOR xv, v, TP0, TP0, TP0 ++.endm ++ ++.macro DLOAD_X8 ++ GLD xv, , X0, X, 0x00, X1, X, 0x20 ++.endm ++ ++.macro DLOAD_X4 ++ GLD xv, , X0, X, 0x00 ++.endm ++ ++.macro DLOAD_X8_GAP ++ fld.d $f1, X, 0x00 ++ fldx.d $f2, X, INC_X ++ PTR_ALSL T0, INC_X, X, 1 ++ fld.d $f3, T0, 0x00 ++ fldx.d $f4, T0, INC_X ++ GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 ++ PTR_ALSL T0, INC_X, X, 2 ++ fld.d $f2, T0, 0x00 ++ fldx.d $f3, T0, INC_X ++ PTR_ALSL T0, INC_X, T0, 1 ++ fld.d $f4, T0, 0x00 ++ fldx.d $f5, T0, INC_X ++ GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3 ++.endm ++ ++.macro DLOAD_X4_GAP ++ fld.d $f1, X, 0x00 ++ fldx.d $f2, X, INC_X ++ PTR_ALSL T0, INC_X, X, 1 ++ fld.d $f3, T0, 0x00 ++ 
fldx.d $f4, T0, INC_X ++ GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 ++.endm ++ ++.macro DGEMV_T_8x8 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA0, 0, \ ++ A2, PA1, 0, A3, PA1, 0, \ ++ A4, PA2, 0, A5, PA2, 0, \ ++ A6, PA3, 0, A7, PA3, 0, \ ++ A8, PA4, 0, A9, PA4, 0, \ ++ A10, PA5, 0, A11, PA5, 0, \ ++ A12, PA6, 0, A13, PA6, 0, \ ++ A14, PA7, 0, A15, PA7, 0 ++ ++ GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ ++ TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ ++ TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ ++ TP3, A6, X0, TP3, TP3, A7, X1, TP3, \ ++ TP4, A8, X0, TP4, TP4, A9, X1, TP4, \ ++ TP5, A10, X0, TP5, TP5, A11, X1, TP5, \ ++ TP6, A12, X0, TP6, TP6, A13, X1, TP6, \ ++ TP7, A14, X0, TP7, TP7, A15, X1, TP7 ++.endm ++ ++.macro DGEMV_T_8x4 ++ GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0, \ ++ A8, PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0 ++ ++ GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ ++ TP2, A4, X0, TP2, TP3, A6, X0, TP3, \ ++ TP4, A8, X0, TP4, TP5, A10, X0, TP5, \ ++ TP6, A12, X0, TP6, TP7, A14, X0, TP7, ++.endm ++ ++.macro DGEMV_T_4x8 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA0, 0, \ ++ A2, PA1, 0, A3, PA1, 0, \ ++ A4, PA2, 0, A5, PA2, 0, \ ++ A6, PA3, 0, A7, PA3, 0 ++ ++ GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ ++ TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ ++ TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ ++ TP3, A6, X0, TP3, TP3, A7, X1, TP3 ++.endm ++ ++.macro DGEMV_T_4x4 ++ GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 ++ ++ GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ ++ TP2, A4, X0, TP2, TP3, A6, X0, TP3 ++.endm ++ ++.macro DGEMV_T_2x8 ++ GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0, A2, PA1, 0, A3, PA1, 0 ++ ++ GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ ++ TP1, A2, X0, TP1, TP1, A3, X1, TP1 ++.endm ++ ++.macro DGEMV_T_2x4 ++ GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 ++ ++ GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1 ++.endm ++ ++.macro DGEMV_T_LASX XW:req X8:req, X4:req ++ PTR_SRLI J, N, 3 ++ beqz J, .L_\XW\()_N_7 ++ PTR_SLLI K_LDA, LDA, 3 ++ PTR_SUB K_LDA, K_LDA, M8 ++.L_\XW\()_N_L8: ++ ZERO_Y8 ++ move X, X_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_M_7 ++.align 5 ++.L_\XW\()_M_L8: ++ DLOAD_\X8 ++ DGEMV_T_8x8 ++ PTR_ADDI I, I, -1 ++ PTR_ALSL X, INC_X, X, 3 ++ bnez I, .L_\XW\()_M_L8 ++.L_\XW\()_M_7: ++ andi I, M, 4 ++ beqz I, .L_\XW\()_M_3 ++ DLOAD_\X4 ++ DGEMV_T_8x4 ++ PTR_ALSL X, INC_X, X, 2 ++.L_\XW\()_M_3: ++ // Accumulated ++ GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \ ++ Y5, TP5, Y6, TP6, Y7, TP7 ++ andi I, M, 3 ++ beqz I, .L_\XW\()_M_END ++.align 5 ++.L_\XW\()_M_L1: ++ fld.d $f1, X, 0x00 ++ fld.d $f11, PA0, 0x00 ++ fld.d $f12, PA1, 0x00 ++ fld.d $f13, PA2, 0x00 ++ fld.d $f14, PA3, 0x00 ++ fld.d $f15, PA4, 0x00 ++ fld.d $f16, PA5, 0x00 ++ fld.d $f17, PA6, 0x00 ++ fld.d $f18, PA7, 0x00 ++#if __loongarch_grlen == 64 ++ GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ ++ PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 ++#elif __loongarch_grlen == 32 ++ GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ ++ PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 ++#else ++ GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ ++ PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 ++#endif ++ GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \ ++ $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10 ++ 
PTR_ADDI I, I, -1 ++ PTR_ADD X, X, INC_X ++ bnez I, .L_\XW\()_M_L1 ++.L_\XW\()_M_END: ++ fld.d $f11, Y, 0x00 ++ fldx.d $f12, Y, INC_Y ++ PTR_ALSL PY0, INC_Y, Y, 1 ++ fld.d $f13, PY0, 0x00 ++ fldx.d $f14, PY0, INC_Y ++ PTR_ALSL PY1, INC_Y, Y, 2 ++ fld.d $f15, PY1, 0x00 ++ fldx.d $f16, PY1, INC_Y ++ PTR_ALSL PY2, INC_Y, PY1, 1 ++ fld.d $f17, PY2, 0x00 ++ fldx.d $f18, PY2, INC_Y ++ ++ GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \ ++ $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18 ++ ++ PTR_ADDI J, J, -1 ++#if __loongarch_grlen == 64 ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#endif ++ fst.d $f11, Y, 0x00 ++ fstx.d $f12, Y, INC_Y ++ fst.d $f13, PY0, 0x00 ++ fstx.d $f14, PY0, INC_Y ++ fst.d $f15, PY1, 0x00 ++ fstx.d $f16, PY1, INC_Y ++ fst.d $f17, PY2, 0x00 ++ fstx.d $f18, PY2, INC_Y ++ PTR_ALSL Y, INC_Y, Y, 3 ++ bnez J, .L_\XW\()_N_L8 ++.L_\XW\()_N_7: ++ andi J, N, 4 ++ beqz J, .L_\XW\()_N_3 ++ ZERO_Y4 ++ move X, X_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_N_4_M_7 ++.align 5 ++.L_\XW\()_N_4_M_L8: ++ DLOAD_\X8 ++ DGEMV_T_4x8 ++ PTR_ADDI I, I, -1 ++ PTR_ALSL X, INC_X, X, 3 ++ bnez I, .L_\XW\()_N_4_M_L8 ++.L_\XW\()_N_4_M_7: ++ andi I, M, 4 ++ beqz I, .L_\XW\()_N_4_M_3 ++ DLOAD_\X4 ++ DGEMV_T_4x4 ++ PTR_ALSL X, INC_X, X, 2 ++.L_\XW\()_N_4_M_3: ++ // Accumulated ++ GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 ++ andi I, M, 3 ++ beqz I, .L_\XW\()_N_4_M_END ++.align 5 ++.L_\XW\()_N_4_M_L1: ++ fld.d $f1, X, 0x00 ++ GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00 ++ GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6 ++ PTR_ADDI I, I, -1 ++ PTR_ADD X, X, INC_X ++ bnez I, .L_\XW\()_N_4_M_L1 ++.L_\XW\()_N_4_M_END: ++ fld.d $f11, Y, 0x00 ++ fldx.d $f12, Y, INC_Y ++ PTR_ALSL PY0, INC_Y, Y, 1 ++ fld.d $f13, PY0, 0x00 ++ fldx.d $f14, PY0, INC_Y ++ ++ GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14 ++ ++ PTR_SLLI K_LDA, LDA, 2 ++ PTR_SUB K_LDA, K_LDA, M8 ++ ++#if __loongarch_grlen == 64 ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#endif ++ fst.d $f11, Y, 0x00 ++ fstx.d $f12, Y, INC_Y ++ fst.d $f13, PY0, 0x00 ++ fstx.d $f14, PY0, INC_Y ++ PTR_ALSL Y, INC_Y, Y, 2 ++.L_\XW\()_N_3: ++ andi J, N, 2 ++ beqz J, .L_\XW\()_N_1 ++ ZERO_Y2 ++ move X, X_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_N_2_M_7 ++.align 5 ++.L_\XW\()_N_2_M_L8: ++ DLOAD_\X8 ++ DGEMV_T_2x8 ++ PTR_ADDI I, I, -1 ++ PTR_ALSL X, INC_X, X, 3 ++ bnez I, .L_\XW\()_N_2_M_L8 ++.L_\XW\()_N_2_M_7: ++ andi I, M, 4 ++ beqz I, .L_\XW\()_N_2_M_3 ++ DLOAD_\X4 ++ DGEMV_T_2x4 ++ PTR_ALSL X, INC_X, X, 2 ++.L_\XW\()_N_2_M_3: ++ // Accumulated ++ GACC xvf, d, Y0, TP0, Y1, TP1 ++ andi I, M, 3 ++ beqz I, .L_\XW\()_N_2_M_END 
++.align 5 ++.L_\XW\()_N_2_M_L1: ++ fld.d $f1, X, 0x00 ++ GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00 ++ GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4 ++ PTR_ADDI I, I, -1 ++ PTR_ADD X, X, INC_X ++ bnez I, .L_\XW\()_N_2_M_L1 ++.L_\XW\()_N_2_M_END: ++ fld.d $f11, Y, 0x00 ++ fldx.d $f12, Y, INC_Y ++ ++ GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12 ++ ++ PTR_SLLI K_LDA, LDA, 1 ++ PTR_SUB K_LDA, K_LDA, M8 ++ ++#if __loongarch_grlen == 64 ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA ++#endif ++ fst.d $f11, Y, 0x00 ++ fstx.d $f12, Y, INC_Y ++ PTR_ALSL Y, INC_Y, Y, 1 ++.L_\XW\()_N_1: ++ andi J, N, 1 ++ beqz J, .L_END ++ ZERO_Y1 ++ move X, X_ORG ++ move I, M ++ beqz I, .L_END ++.align 5 ++.L_\XW\()_N_1_M_L1: ++ fld.d $f3, PA0, 0x00 ++ fld.d $f1, X, 0x00 ++ fmadd.d $f19, $f3, $f1, $f19 ++ PTR_ADDI I, I, -1 ++ PTR_ADD X, X, INC_X ++ PTR_ADDI PA0, PA0, 0x08 ++ bnez I, .L_\XW\()_N_1_M_L1 ++ fld.d $f3, Y, 0x00 ++ fmadd.d $f3, ALPHA, $f19, $f3 ++ fst.d $f3, Y, 0x00 ++ b .L_END ++.endm ++ ++ PROLOGUE ++ PTR_LD INC_Y, $sp, 0 ++ push_if_used 17 + 8, 24 + 3 ++ PTR_ADDI K, $r0, 0x01 ++ PTR_SUB I, INC_X, K ++ maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ ++ GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 ++ xvreplve0.d VALPHA, $xr0 ++ move X_ORG, X ++ move PA0, A ++#if __loongarch_grlen == 64 ++ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#else ++ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#endif ++ la.local T0, .L_GAP_TABLE ++ PTR_ALSL I, I, T0, 1 ++ ld.h K, I, 0 ++ PTR_ADD T0, T0, K ++ jirl $r0, T0, 0 ++.L_GAP_TABLE: ++ .hword .L_GAP_0 - .L_GAP_TABLE ++ .hword .L_GAP_1 - .L_GAP_TABLE ++.L_GAP_0: /* if (incx == 1) */ ++ DGEMV_T_LASX GAP_0, X8, X4 ++.L_GAP_1: /* if (incx != 1) */ ++ DGEMV_T_LASX GAP_1, X8_GAP, X4_GAP ++.L_END: ++ pop_if_used 17 + 8, 24 + 3 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S +new file mode 100644 +index 000000000..3315daccb +--- /dev/null ++++ b/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S +@@ -0,0 +1,1366 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. 
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/07/26 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++*********************************************************************/ ++ ++/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, ++ * FLOAT *c, BLASLONG ldc, BLASLONG offset) ++ */ ++#define M $r4 // param 1: bm ++#define N $r5 // param 2: bn ++#define K $r6 // param 3: bk ++#define A $r7 // param 5: ba ++#define B $r8 // param 6: bb ++#define C $r9 // param 7: bc ++#define LDC $r10 // param 8: ldc ++#define OFFSET $r11 // param 9: offset ++ ++/* Cycle control parameters */ ++#define I $r13 ++#define J $r14 ++#define L $r15 ++#define TL $r16 ++/* Matrix address */ ++#define A0 $r17 ++#define B0 $r18 ++#define C0 $r19 ++#define C1 $r20 ++#define C2 $r23 ++#define C3 $r24 ++#define T0 $r25 ++#define T1 $r26 ++#define T2 $r27 ++#define KK $r28 ++#define AA $r29 ++#define CC $r30 ++#undef ZERO ++#define ZERO $r0 ++ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++#define U8 $xr8 ++#define U9 $xr9 ++#define U10 $xr10 ++#define U11 $xr11 ++#define U12 $xr12 ++#define U13 $xr13 ++#define U14 $xr14 ++#define U15 $xr15 ++#define D0 $xr16 ++#define D1 $xr17 ++#define D2 $xr18 ++#define D3 $xr19 ++#define D4 $xr20 ++#define D5 $xr21 ++#define D6 $xr22 ++#define D7 $xr23 ++#define D8 $xr24 ++#define D9 $xr25 ++#define D10 $xr26 ++#define D11 $xr27 ++#define D12 $xr28 ++#define D13 $xr29 ++#define D14 $xr30 ++#define D15 $xr31 ++ ++/* Prefetch interval */ ++#define A_PRE 0x400 ++#define B_PRE 0x100 ++ ++#include "dtrsm_kernel_macro.S" ++ ++// By integrating the dgemm and dsolve processes, the following advantages can be obtained: ++// 1. Avoid the overhead of function calls (by not invoking dgemm_kernel) ++// 2. Reduce the storage and retrieval of C data ++// 3. Vectorization of dsolve ++// GEMM_UNROLL_M x DGEMM_UNROLL_N is 16x4, which is a fairly large size. ++// To achieve finer-grained optimization, 15 scenarios have been addressed: ++// 16x4, 16x2, 16x1, 8x4, 8x2, 8x1, 4x4, 4x2, 4x1, 2x4, 2x2, 2x1, 1x4, 1x2, 1x1. ++ ++.macro dsolve_16 N ++// if N = 4 the data layout of C is as follows: ++// U0 U1 U2 U3 ++// U4 U5 U6 U7 ++// U8 U9 U10 U11 ++// U12 U13 U14 U15 ++// if N = 2 the dat layout of C is as follows: ++// U0 U1 U2 U3 ++// U4 U5 U6 U7 ++// if N = 1 the dat layout of C is as follows: ++// U0 U1 U2 U3 ++// The matrix A has dimensions of 16x16, and ++// it will be divided into 4 segments for processing. 
++ ++#define G12 U3 ++#define G13 U7 ++#define G14 U11 ++#define G15 U15 ++ GTRANSPOSE4x4_D U3, U7, U11, U15, G12, G13, G14, G15, D0, D1 ++ // A ++ // G12 G13 G14 G15 ++ // ----------------- ++ // 204 | D9 ++ // 220 221 | D8 D7 ++ // 236 237 238 | D6 D5 D4 ++ // 252 253 254 255 | D3 D2 D1 D0 ++ PTR_ADDI T0, A0, 252 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 236 * 8 ++ GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 ++ PTR_ADDI T0, A0, 220 * 8 ++ GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 ++ PTR_ADDI T0, A0, 204 * 8 ++ GLDREPL xv, d, D9, T0, 0 ++ ++ xvfmul.d G15, G15, D0 ++ GNMSUB xvf, d, G14, G15, D1, G14 ++ xvfmul.d G14, G14, D4 ++ GNMSUB xvf, d, G13, G15, D2, G13, G13, G14, D5, G13 ++ xvfmul.d G13, G13, D7 ++ GNMSUB xvf, d, G12, G15, D3, G12, G12, G14, D6, G12, G12, G13, D8, G12 ++ xvfmul.d G12, G12, D9 ++ // Store B ++.if \N == 4 ++ // x x x x ... x x x x ++ // x x x x ... x x x x ++ // x x x x ... x x x x ++ // b48 b49 b50 b51 ... b60 b61 b62 b63 ++ GST xv, , G12, B0, 48 * 8, G13, B0, 52 * 8, G14, B0, 56 * 8, G15, B0, 60 * 8 ++.elseif \N == 2 ++ // x x x x ... x x x x ++ // x x x x ... x x x x ++ // x x x x ... x x x x ++ // b24 b25 b26 b27 b28 b29 b30 b31 ++ GST v, , $vr3, B0, 24 * 8, $vr7, B0, 26 * 8, $vr11, B0, 28 * 8, $vr15, B0, 30 * 8 ++.elseif \N == 1 ++ // x x x x ++ // x x x x ++ // x x x x ++ // b12 b13 b14 b15 ++ GST f, d, $f3, B0, 12 * 8, $f7, B0, 13 * 8, $f11, B0, 14 * 8, $f15, B0, 15 * 8 ++.endif ++ // Transpose G15 G14 G13 G12 ++ GTRANSPOSE4x4_D G12, G13, G14, G15, D0, D1, D2, D3, D4, D5 ++ // Store C ++.if \N == 4 ++ // x x x x ... c12 c13 c14 c15 ++ // x x x x ... c28 c29 c30 c31 ++ // x x x x ... c44 c45 c46 c47 ++ // x x x x ... c60 c61 c62 c63 ++ GST xv, , D0, C0, 12 * 8, D1, C1, 12 * 8, D2, C2, 12 * 8, D3, C3, 12 * 8 ++.elseif \N == 2 ++ // x x x x ... c12 c13 c14 c15 ++ // x x x x ... c28 c29 c30 c31 ++ GST xv, , D0, C0, 12 * 8, D1, C1, 12 * 8 ++.elseif \N == 1 ++ // Store C ++ // x x x x ... 
c12 c13 c14 c15 ++ GST xv, , D0, C0, 12 * 8 ++.endif ++ ++#define G8 U2 ++#define G9 U6 ++#define G10 U10 ++#define G11 U14 ++ GTRANSPOSE4x4_D U2, U6, U10, U14, G8, G9, G10, G11, D0, D1 ++ // A ++ // G8 G9 G10 G11 ++ // ----------------- ++ // 136 | D9 ++ // 152 153 | D8 D7 ++ // 168 169 170 | D6 D5 D4 ++ // 184 185 186 187 | D3 D2 D1 D0 ++ // 200 201 202 203 | D15 D14 D13 D12 ++ // 216 217 218 219 | D11 D10 D9 D8 ++ // 232 233 234 235 | D7 D6 D5 D4 ++ // 248 249 250 251 | D3 D2 D1 D0 ++ PTR_ADDI T0, A0, 248 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 232 * 8 ++ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 ++ PTR_ADDI T0, A0, 216 * 8 ++ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 ++ PTR_ADDI T0, A0, 200 * 8 ++ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 ++ GNMSUB xvf, d, G11, G15, D0, G11, G10, G15, D1, G10, G9, G15, D2, G9, G8, G15, D3, G8, \ ++ G11, G14, D4, G11, G10, G14, D5, G10, G9, G14, D6, G9, G8, G14, D7, G8, \ ++ G11, G13, D8, G11, G10, G13, D9, G10, G9, G13, D10, G9, G8, G13, D11, G8, \ ++ G11, G12, D12, G11, G10, G12, D13, G10, G9, G12, D14, G9, G8, G12, D15, G8 ++ PTR_ADDI T0, A0, 184 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 168 * 8 ++ GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 ++ PTR_ADDI T0, A0, 152 * 8 ++ GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 ++ PTR_ADDI T0, A0, 136 * 8 ++ GLDREPL xv, d, D9, T0, 0 ++ ++ xvfmul.d G11, G11, D0 ++ GNMSUB xvf, d, G10, G11, D1, G10, G9, G11, D2, G9, G8, G11, D3, G8 ++ xvfmul.d G10, G10, D4 ++ GNMSUB xvf, d, G9, G10, D5, G9, G8, G10, D6, G8 ++ xvfmul.d G9, G9, D7 ++ GNMSUB xvf, d, G8, G9, D8, G8 ++ xvfmul.d G8, G8, D9 ++ // Store B ++.if \N == 4 ++ // x x x x ... x x x x ++ // x x x x ... x x x x ++ // b32 b33 b34 b34 ... b44 b45 b46 b47 ++ // b48 b49 b50 b51 ... b60 b61 b62 b63 ++ GST xv, , G8, B0, 32 * 8, G9, B0, 36 * 8, G10, B0, 40 * 8, G11, B0, 44 * 8 ++.elseif \N == 2 ++ // x x x x ... x x x x ++ // x x x x ... x x x x ++ // b16 b17 b18 b19 b20 b21 b22 b23 ++ // b24 b25 b26 b27 b28 b29 b30 b31 ++ GST v, , $vr2, B0, 16 * 8, $vr6, B0, 18 * 8, $vr10, B0, 20 * 8, $vr14, B0, 22 * 8 ++.elseif \N == 1 ++ // x x x x ++ // x x x x ++ // b8 b9 b10 b11 ++ // b12 b13 b14 b15 ++ GST f, d, $f2, B0, 8 * 8, $f6, B0, 9 * 8, $f10, B0, 10 * 8, $f14, B0, 11 * 8 ++.endif ++ // Transpose G11 G10 G9 G8 ++ GTRANSPOSE4x4_D G8, G9, G10, G11, D0, D1, D2, D3, D4, D5 ++ // Store C ++.if \N == 4 ++ // x x x x ... c8 c9 c10 c11 c12 c13 c14 c15 ++ // x x x x ... c24 c25 c26 c27 c28 c29 c30 c31 ++ // x x x x ... c40 c41 c42 c43 c44 c45 c46 c47 ++ // x x x x ... c56 c57 c58 c59 c60 c61 c62 c63 ++ GST xv, , D0, C0, 8 * 8, D1, C1, 8 * 8, D2, C2, 8 * 8, D3, C3, 8 * 8 ++.elseif \N == 2 ++ // x x x x ... c8 c9 c10 c11 c12 c13 c14 c15 ++ // x x x x ... c24 c25 c26 c27 c28 c29 c30 c31 ++ GST xv, , D0, C0, 8 * 8, D1, C1, 8 * 8 ++.elseif \N == 1 ++ // x x x x ... 
c8 c9 c10 c11 c12 c13 c14 c15 ++ GST xv, , D0, C0, 8 * 8 ++.endif ++ ++#define G4 U1 ++#define G5 U5 ++#define G6 U9 ++#define G7 U13 ++ GTRANSPOSE4x4_D U1, U5, U9, U13, G4, G5, G6, G7, D0, D1 ++ // A ++ // G4 G5 G6 G7 ++ // ------------------ ++ // 68 | D9 ++ // 84 85 | D8 D7 ++ // 100 101 102 | D6 D5 D4 ++ // 116 117 118 119 | D3 D2 D1 D0 ++ // 132 133 134 135 | D15 D14 D13 D12 ++ // 148 149 150 151 | D11 D10 D9 D8 ++ // 164 165 166 167 | D7 D6 D5 D4 ++ // 180 181 182 183 | D3 D2 D1 D0 ++ // 196 197 198 199 | D15 D14 D13 D12 ++ // 212 213 214 215 | D11 D10 D9 D8 ++ // 228 229 230 231 | D7 D6 D5 D4 ++ // 244 245 246 247 | D3 D2 D1 D0 ++ PTR_ADDI T0, A0, 244 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 228 * 8 ++ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 ++ PTR_ADDI T0, A0, 212 * 8 ++ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 ++ PTR_ADDI T0, A0, 196 * 8 ++ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 ++ GNMSUB xvf, d, G7, G15, D0, G7, G6, G15, D1, G6, G5, G15, D2, G5, G4, G15, D3, G4, \ ++ G7, G14, D4, G7, G6, G14, D5, G6, G5, G14, D6, G5, G4, G14, D7, G4, \ ++ G7, G13, D8, G7, G6, G13, D9, G6, G5, G13, D10, G5, G4, G13, D11, G4, \ ++ G7, G12, D12, G7, G6, G12, D13, G6, G5, G12, D14, G5, G4, G12, D15, G4 ++ PTR_ADDI T0, A0, 180 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 164 * 8 ++ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 ++ PTR_ADDI T0, A0, 148 * 8 ++ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 ++ PTR_ADDI T0, A0, 132 * 8 ++ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 ++ GNMSUB xvf, d, G7, G11, D0, G7, G6, G11, D1, G6, G5, G11, D2, G5, G4, G11, D3, G4, \ ++ G7, G10, D4, G7, G6, G10, D5, G6, G5, G10, D6, G5, G4, G10, D7, G4, \ ++ G7, G9, D8, G7, G6, G9, D9, G6, G5, G9, D10, G5, G4, G9, D11, G4, \ ++ G7, G8, D12, G7, G6, G8, D13, G6, G5, G8, D14, G5, G4, G8, D15, G4 ++ PTR_ADDI T0, A0, 116 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 100 * 8 ++ GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 ++ PTR_ADDI T0, A0, 84 * 8 ++ GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 ++ PTR_ADDI T0, A0, 68 * 8 ++ GLDREPL xv, d, D9, T0, 0 ++ xvfmul.d G7, G7, D0 ++ GNMSUB xvf, d, G6, G7, D1, G6, G5, G7, D2, G5, G4, G7, D3, G4 ++ xvfmul.d G6, G6, D4 ++ GNMSUB xvf, d, G5, G6, D5, G5, G4, G6, D6, G4 ++ xvfmul.d G5, G5, D7 ++ GNMSUB xvf, d, G4, G5, D8, G4 ++ xvfmul.d G4, G4, D9 ++ // Store B ++.if \N == 4 ++ // x x x x ... x x x x ++ // b16 b17 b18 b19 ... b28 b29 b30 b31 ++ // b32 b33 b34 b34 ... b44 b45 b46 b47 ++ // b48 b49 b50 b51 ... b60 b61 b62 b63 ++ GST xv, , G4, B0, 16 * 8, G5, B0, 20 * 8, G6, B0, 24 * 8, G7, B0, 28 * 8 ++.elseif \N == 2 ++ // x x x x ... 
x x x x ++ // b8 b9 b10 b11 b12 b13 b14 b15 ++ // b16 b17 b18 b19 b20 b21 b22 b23 ++ // b24 b25 b26 b27 b28 b29 b30 b31 ++ GST v, , $vr1, B0, 8 * 8, $vr5, B0, 10 * 8, $vr9, B0, 12 * 8, $vr13, B0, 14 * 8 ++.elseif \N == 1 ++ // x x x x ++ // b4 b5 b6 b7 ++ // b8 b9 b10 b11 ++ // b12 b13 b14 b15 ++ GST f, d, $f1, B0, 4 * 8, $f5, B0, 5 * 8, $f9, B0, 6 * 8, $f13, B0, 7 * 8 ++.endif ++ // Transpose G7 G6 G5 G4 ++ GTRANSPOSE4x4_D G4, G5, G6, G7, D0, D1, D2, D3, D4, D5 ++ // Store C ++.if \N == 4 ++ // x x x x c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 ++ // x x x x c20 c21 c22 c23 c24 c25 c26 c27 c28 c29 c30 c31 ++ // x x x x c36 c37 c38 c39 c40 c41 c42 c43 c44 c45 c46 c47 ++ // x x x x c52 c53 c54 c55 c56 c57 c58 c59 c60 c61 c62 c63 ++ GST xv, , D0, C0, 4 * 8, D1, C1, 4 * 8, D2, C2, 4 * 8, D3, C3, 4 * 8 ++.elseif \N == 2 ++ // x x x x c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 ++ // x x x x c20 c21 c22 c23 c24 c25 c26 c27 c28 c29 c30 c31 ++ GST xv, , D0, C0, 4 * 8, D1, C1, 4 * 8 ++.elseif \N == 1 ++ // x x x x c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 ++ GST xv, , D0, C0, 4 * 8 ++.endif ++ ++#define G0 U0 ++#define G1 U4 ++#define G2 U8 ++#define G3 U12 ++ GTRANSPOSE4x4_D U0, U4, U8, U12, G0, G1, G2, G3, D0, D1 ++ // A ++ // G0 G1 G2 G3 ++ // ------------------ ++ // 0 | D9 ++ // 16 17 | D8 D7 ++ // 32 33 34 | D6 D5 D4 ++ // 48 49 50 51 | D3 D2 D1 D0 ++ // 64 65 66 67 | D15 D14 D13 D12 ++ // 80 81 82 83 | D11 D10 D9 D8 ++ // 96 97 98 99 | D7 D6 D5 D4 ++ // 112 113 114 115 | D3 D2 D1 D0 ++ // 128 129 130 131 | D15 D14 D13 D12 ++ // 144 145 146 147 | D11 D10 D9 D8 ++ // 160 161 162 163 | D7 D6 D5 D4 ++ // 176 177 178 179 | D3 D2 D1 D0 ++ // 192 193 194 195 | D15 D14 D13 D12 ++ // 208 209 210 211 | D11 D10 D9 D8 ++ // 224 225 226 227 | D7 D6 D5 D4 ++ // 240 241 242 243 | D3 D2 D1 D0 ++ PTR_ADDI T0, A0, 240 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 224 * 8 ++ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 ++ PTR_ADDI T0, A0, 208 * 8 ++ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 ++ PTR_ADDI T0, A0, 192 * 8 ++ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 ++ GNMSUB xvf, d, G3, G15, D0, G3, G2, G15, D1, G2, G1, G15, D2, G1, G0, G15, D3, G0, \ ++ G3, G14, D4, G3, G2, G14, D5, G2, G1, G14, D6, G1, G0, G14, D7, G0, \ ++ G3, G13, D8, G3, G2, G13, D9, G2, G1, G13, D10, G1, G0, G13, D11, G0, \ ++ G3, G12, D12, G3, G2, G12, D13, G2, G1, G12, D14, G1, G0, G12, D15, G0 ++ PTR_ADDI T0, A0, 176 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 160 * 8 ++ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 ++ PTR_ADDI T0, A0, 144 * 8 ++ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 ++ PTR_ADDI T0, A0, 128 * 8 ++ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 ++ GNMSUB xvf, d, G3, G11, D0, G3, G2, G11, D1, G2, G1, G11, D2, G1, G0, G11, D3, G0, \ ++ G3, G10, D4, G3, G2, G10, D5, G2, G1, G10, D6, G1, G0, G10, D7, G0, \ ++ G3, G9, D8, G3, G2, G9, D9, G2, G1, G9, D10, G1, G0, G9, D11, G0, \ ++ G3, G8, D12, G3, G2, G8, D13, G2, G1, G8, D14, G1, G0, G8, D15, G0 ++ PTR_ADDI T0, A0, 112 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 96 * 8 ++ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 ++ PTR_ADDI T0, A0, 80 * 8 ++ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 ++ PTR_ADDI T0, 
A0, 64 * 8 ++ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 ++ GNMSUB xvf, d, G3, G7, D0, G3, G2, G7, D1, G2, G1, G7, D2, G1, G0, G7, D3, G0, \ ++ G3, G6, D4, G3, G2, G6, D5, G2, G1, G6, D6, G1, G0, G6, D7, G0, \ ++ G3, G5, D8, G3, G2, G5, D9, G2, G1, G5, D10, G1, G0, G5, D11, G0, \ ++ G3, G4, D12, G3, G2, G4, D13, G2, G1, G4, D14, G1, G0, G4, D15, G0 ++ PTR_ADDI T0, A0, 48 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 32 * 8 ++ GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 ++ PTR_ADDI T0, A0, 16 * 8 ++ GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 ++ PTR_ADDI T0, A0, 0 * 8 ++ GLDREPL xv, d, D9, T0, 0 ++ ++ xvfmul.d G3, G3, D0 ++ GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0 ++ xvfmul.d G2, G2, D4 ++ GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0 ++ xvfmul.d G1, G1, D7 ++ GNMSUB xvf, d, G0, G1, D8, G0 ++ xvfmul.d G0, G0, D9 ++ // Store B ++.if \N == 4 ++ // b0 b1 b2 b3 ... b12 b13 b14 b15 ++ // b16 b17 b18 b19 ... b28 b29 b30 b31 ++ // b32 b33 b34 b34 ... b44 b45 b46 b47 ++ // b48 b49 b50 b51 ... b60 b61 b62 b63 ++ GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8 ++.elseif \N == 2 ++ // b0 b1 b2 b3 b4 b5 b6 b7 ++ // b8 b9 b10 b11 b12 b13 b14 b15 ++ // b16 b17 b18 b19 b20 b21 b22 b23 ++ // b24 b25 b26 b27 b28 b29 b30 b31 ++ GST v, , $vr0, B0, 0, $vr4, B0, 2 * 8, $vr8, B0, 4 * 8, $vr12, B0, 6 * 8 ++.elseif \N == 1 ++ // b0 b1 b2 b3 ++ // b4 b5 b6 b7 ++ // b8 b9 b10 b11 ++ // b12 b13 b14 b15 ++ GST f, d, $f0, B0, 0, $f4, B0, 1 * 8, $f8, B0, 2 * 8, $f12, B0, 3 * 8 ++.endif ++ // Transpose C3 C2 C1 C0 ++ GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5 ++ // Store C ++.if \N == 4 ++ // c0 c1 c2 c3 ... c12 c13 c14 c15 ++ // c16 c17 c18 c19 ... c28 c29 c30 c31 ++ // c32 c33 c34 c34 ... c44 c45 c46 c47 ++ // c48 c49 c50 c51 ... c60 c61 c62 c63 ++ GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0 ++.elseif \N == 2 ++ // c0 c1 c2 c3 ... c12 c13 c14 c15 ++ // c16 c17 c18 c19 ... c28 c29 c30 c31 ++ GST xv, , D0, C0, 0, D1, C1, 0 ++.elseif \N == 1 ++ // c0 c1 c2 c3 ... c12 c13 c14 c15 ++ GST xv, , D0, C0, 0 ++.endif ++ ++#undef G0 ++#undef G1 ++#undef G2 ++#undef G3 ++#undef G4 ++#undef G5 ++#undef G6 ++#undef G7 ++#undef G8 ++#undef G9 ++#undef G10 ++#undef G11 ++#undef G12 ++#undef G13 ++#undef G14 ++#undef G15 ++.endm ++ ++.macro dsolve_8 N ++// if N = 4 the data layout of C is as follows: ++// U0 U1 ++// U2 U3 ++// U4 U5 ++// U6 U7 ++// if N = 2 the dat layout of C is as follows: ++// U0 U1 ++// U2 U3 ++// if N = 1 the dat layout of C is as follows: ++// U0 U1 ++// The matrix A has dimensions of 8x8, and ++// it will be divided into 2 segments for processing. 
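++// Note: as in dsolve_16 above, each elimination step follows the usual
++// substitution pattern of the trsm kernels. The diagonal entries loaded from
++// the packed A panel are assumed to be stored pre-inverted (as the OpenBLAS
++// trsm pack routines conventionally provide), so a multiply replaces the
++// divide. In rough scalar form, with hypothetical names x[] for a solution
++// row and inv_d for the pre-inverted diagonal entry:
++//   x[i]  = x[i] * inv_d;              // xvfmul.d  Gi, Gi, Dk
++//   x[j] -= a[j][i] * x[i];            // GNMSUB for every remaining row j
++// The vector code below applies these two operations to four result columns
++// at a time.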
++ ++#define G4 U1 ++#define G5 U3 ++#define G6 U5 ++#define G7 U7 ++ // Transpose U7 U5 U3 U1 ++ GTRANSPOSE4x4_D U1, U3, U5, U7, G4, G5, G6, G7, D0, D1 ++ // A ++ // G4 G5 G6 G7 ++ // --------------- ++ // 36 | D9 ++ // 44 45 | D8 D7 ++ // 52 53 54 | D6 D5 D4 ++ // 60 61 62 63 | D3 D2 D1 D0 ++ PTR_ADDI T0, A0, 60 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 52 * 8 ++ GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 ++ PTR_ADDI T0, A0, 44 * 8 ++ GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 ++ PTR_ADDI T0, A0, 36 * 8 ++ GLDREPL xv, d, D9, T0, 0 ++ ++ xvfmul.d G7, G7, D0 ++ GNMSUB xvf, d, G6, G7, D1, G6, G5, G7, D2, G5, G4, G7, D3, G4 ++ xvfmul.d G6, G6, D4 ++ GNMSUB xvf, d, G5, G6, D5, G5, G4, G6, D6, G4 ++ xvfmul.d G5, G5, D7 ++ GNMSUB xvf, d, G4, G5, D8, G4 ++ xvfmul.d G4, G4, D9 ++ // Store B ++.if \N == 4 ++ GST xv, , G4, B0, 16 * 8, G5, B0, 20 * 8, G6, B0, 24 * 8, G7, B0, 28 * 8 ++.elseif \N == 2 ++ GST v, , $vr1, B0, 8 * 8, $vr3, B0, 10 * 8, $vr5, B0, 12 * 8, $vr7, B0, 14 * 8 ++.elseif \N == 1 ++ GST f, d, $f1, B0, 4 * 8, $f3, B0, 5 * 8, $f5, B0, 6 * 8, $f7, B0, 7 * 8 ++.endif ++ // Transpose ++ GTRANSPOSE4x4_D G4, G5, G6, G7, D4, D5, D6, D7, D8, D9 ++ // Store C ++.if \N == 4 ++ GST xv, , D4, C0, 4 * 8, D5, C1, 4 * 8, D6, C2, 4 * 8, D7, C3, 4 * 8 ++.elseif \N == 2 ++ GST xv, , D4, C0, 4 * 8, D5, C1, 4 * 8 ++.elseif \N == 1 ++ GST xv, , D4, C0, 4 * 8 ++.endif ++ ++#define G0 U0 ++#define G1 U2 ++#define G2 U4 ++#define G3 U6 ++ // Transpose U6 U4 U2 U0 ++ GTRANSPOSE4x4_D U0, U2, U4, U6, G0, G1, G2, G3, D0, D1 ++ // A ++ // G0 G1 G2 G3 ++ //----------------- ++ // 0 | D9 ++ // 8 9 | D8 D7 ++ // 16 17 18 | D6 D5 D4 ++ // 24 25 26 27 | D3 D2 D1 D0 ++ // 32 33 34 35 | D15 D14 D13 D12 ++ // 40 41 42 43 | D11 D10 D9 D8 ++ // 48 49 50 51 | D7 D6 D5 D4 ++ // 56 57 58 59 | D3 D2 D1 D0 ++ PTR_ADDI T0, A0, 56 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 48 * 8 ++ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 ++ PTR_ADDI T0, A0, 40 * 8 ++ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 ++ PTR_ADDI T0, A0, 32 * 8 ++ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 ++ GNMSUB xvf, d, G3, G7, D0, G3, G2, G7, D1, G2, G1, G7, D2, G1, G0, G7, D3, G0, \ ++ G3, G6, D4, G3, G2, G6, D5, G2, G1, G6, D6, G1, G0, G6, D7, G0, \ ++ G3, G5, D8, G3, G2, G5, D9, G2, G1, G5, D10, G1, G0, G5, D11, G0, \ ++ G3, G4, D12, G3, G2, G4, D13, G2, G1, G4, D14, G1, G0, G4, D15, G0 ++ PTR_ADDI T0, A0, 24 * 8 ++ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 ++ PTR_ADDI T0, A0, 16 * 8 ++ GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 ++ PTR_ADDI T0, A0, 8 * 8 ++ GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 ++ PTR_ADDI T0, A0, 0 * 8 ++ GLDREPL xv, d, D9, T0, 0 ++ ++ xvfmul.d G3, G3, D0 ++ GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0 ++ xvfmul.d G2, G2, D4 ++ GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0 ++ xvfmul.d G1, G1, D7 ++ GNMSUB xvf, d, G0, G1, D8, G0 ++ xvfmul.d G0, G0, D9 ++ // Store B ++.if \N == 4 ++ GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8 ++.elseif \N == 2 ++ GST v, , $vr0, B0, 0, $vr2, B0, 2 * 8, $vr4, B0, 4 * 8, $vr6, B0, 6 * 8 ++.elseif \N == 1 ++ GST f, d, $f0, B0, 0, $f2, B0, 1 * 8, $f4, B0, 2 * 8, $f6, B0, 3 * 8 ++.endif ++ // Transpose ++ GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5 ++ // Store C ++.if \N == 4 ++ GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, 
D3, C3, 0 ++.elseif \N == 2 ++ GST xv, , D0, C0, 0, D1, C1, 0 ++.elseif \N == 1 ++ GST xv, , D0, C0, 0 ++.endif ++ ++#undef G0 ++#undef G1 ++#undef G2 ++#undef G3 ++#undef G4 ++#undef G5 ++#undef G6 ++#undef G7 ++.endm ++ ++.macro dsolve_4 N ++// if N = 4 the data layout of C is as follows: ++// U0 ++// U1 ++// U2 ++// U3 ++// if N = 2 the dat layout of C is as follows: ++// U0 ++// U1 ++// if N = 1 the dat layout of C is as follows: ++// U0 ++// The matrix A has dimensions of 4x4, and ++// it will be divided into 1 segments for processing. ++ ++#define G0 U0 ++#define G1 U1 ++#define G2 U2 ++#define G3 U3 ++ // Transpose U3 U2 U1 U0 ++ GTRANSPOSE4x4_D U0, U1, U2, U3, G0, G1, G2, G3, D0, D1 ++ // A ++ // G0 G1 G2 G3 ++ //------------- ++ // 0 | D9 ++ // 4 5 | D8 D7 ++ // 8 9 10 | D6 D5 D4 ++ // 12 13 14 15 | D3 D2 D1 D0 ++ GLDREPL xv, d, D3, A0, 12 * 8, D2, A0, 13 * 8, D1, A0, 14 * 8, D0, A0, 15 * 8, \ ++ D6, A0, 8 * 8, D5, A0, 9 * 8, D4, A0, 10 * 8, \ ++ D8, A0, 4 * 8, D7, A0, 5 * 8, \ ++ D9, A0, 0 * 8 ++ xvfmul.d G3, G3, D0 ++ GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0 ++ xvfmul.d G2, G2, D4 ++ GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0 ++ xvfmul.d G1, G1, D7 ++ GNMSUB xvf, d, G0, G1, D8, G0 ++ xvfmul.d G0, G0, D9 ++ // Store B ++.if \N == 4 ++ GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8 ++.elseif \N == 2 ++ GST v, , $vr0, B0, 0, $vr1, B0, 2 * 8, $vr2, B0, 4 * 8, $vr3, B0, 6 * 8 ++.elseif \N == 1 ++ GST f, d, $f0, B0, 0, $f1, B0, 1 * 8, $f2, B0, 2 * 8, $f3, B0, 3 * 8 ++.endif ++ // Transpose ++ GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5 ++ // Store C ++.if \N == 4 ++ GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0 ++.elseif \N == 2 ++ GST xv, , D0, C0, 0, D1, C1, 0 ++.elseif \N == 1 ++ GST xv, , D0, C0, 0 ++.endif ++ ++#undef G0 ++#undef G1 ++#undef G2 ++#undef G3 ++.endm ++ ++.macro dsolve_2 N ++#define G0 U2 ++#define G1 U3 ++ // Transpose ++ GSBUTTERFLY xv, d, G0, G1, U1, U0 ++ // A ++ // G0 G1 ++ // ------ ++ // 0 | D2 ++ // 2 3 | D1 D0 ++ GLDREPL xv, d, D2, A0, 0, D1, A0, 2 * 8, D0, A0, 3 * 8 ++ xvfmul.d G1, G1, D0 ++ GNMSUB xvf, d, G0, G1, D1, G0 ++ xvfmul.d G0, G0, D2 ++ // Store B ++.if \N == 4 ++ GST xv, , G0, B0, 0, G1, B0, 4 * 8 ++.elseif \N == 2 ++ GST v, , $vr2, B0, 0, $vr3, B0, 2 * 8 ++.elseif \N == 1 ++ GST f, d, $f2, B0, 0, $f3, B0, 8 ++.endif ++ // Transpose ++ GSBUTTERFLY xv, d, D0, D1, G1, G0 ++ // Store C ++.if \N == 4 ++ vst $vr16, C0, 0x00 ++ vst $vr17, C1, 0x00 ++ xvstelm.d D0, C2, 0x00, 0x02 ++ xvstelm.d D1, C3, 0x00, 0x02 ++ xvstelm.d D0, C2, 0x08, 0x03 ++ xvstelm.d D1, C3, 0x08, 0x03 ++.elseif \N == 2 ++ GST v, , $vr16, C0, 0, $vr17, C1, 0 ++.elseif \N == 1 ++ GST v, , $vr16, C0, 0 ++.endif ++ ++#undef G0 ++#undef G1 ++.endm ++ ++.macro dgemm_dsolve_16x4 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_16x4_load ++ dgemm_16x4 ++ b .L_dsolve_16x4 ++.L_dsolve_16x4_load: ++ // Load C ++ GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 ++ GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 ++ GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 ++ GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 ++/********************** solver ******************/ ++.L_dsolve_16x4: ++ PTR_ADDI A0, T1, -(16 * 8 * 8) ++ PTR_ADDI A0, A0, -(16 * 8 * 8) ++ PTR_ADDI B0, T2, -(16 * 4 * 8) ++ dsolve_16 4 ++.endm ++ ++.macro dgemm_dsolve_1x4 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_1x4_load ++ dgemm_1x4 ++ b .L_dsolve_1x4 
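++// Note: every dgemm_dsolve_MxN macro in this file follows the same shape:
++// A0/B0 are first saved in T1/T2; if L (= K - KK) is not positive there is
++// no pending GEMM update, so the C tile is simply loaded, otherwise the
++// dgemm_MxN macro applies the trailing rank-L update first. The solve then
++// re-addresses the MxM triangular block of A and the matching rows of B
++// backwards from the saved pointers.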
++.L_dsolve_1x4_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++ fld.d $f1, C1, 0x00 ++ fld.d $f2, C2, 0x00 ++ fld.d $f3, C3, 0x00 ++ xvinsve0.d U0, U1, 0x01 ++ xvinsve0.d U0, U2, 0x02 ++ xvinsve0.d U0, U3, 0x03 ++.L_dsolve_1x4: ++ or A0, T1, T1 ++ or B0, T2, T2 ++ GLDREPL xv, d, D0, A0, -1 * 8 ++ GMUL xvf, d, U0, U0, D0 ++ // Store C ++ xvstelm.d U0, C0, 0x00, 0x00 ++ xvstelm.d U0, C1, 0x00, 0x01 ++ xvstelm.d U0, C2, 0x00, 0x02 ++ xvstelm.d U0, C3, 0x00, 0x03 ++ // Store B ++ xvst U0, B0, -32 ++.endm ++ ++.macro dgemm_dsolve_2x4 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_2x4_load ++ dgemm_2x4 ++ b .L_dsolve_2x4 ++.L_dsolve_2x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++ ++ xvpermi.q U0, U2, 0x02 ++ xvpermi.q U1, U3, 0x02 ++/********************** solver ******************/ ++.L_dsolve_2x4: ++ PTR_ADDI A0, T1, -(2 * 2 * 8) ++ PTR_ADDI B0, T2, -(2 * 4 * 8) ++ dsolve_2 4 ++.endm ++ ++.macro dgemm_dsolve_4x4 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_4x4_load ++ dgemm_4x4 ++ b .L_dsolve_4x4 ++.L_dsolve_4x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++/************** solver *****************/ ++.L_dsolve_4x4: ++ PTR_ADDI A0, T1, -(4 * 4 * 8) ++ PTR_ADDI B0, T2, -(4 * 4 * 8) ++ ++ dsolve_4 4 ++.endm ++ ++.macro dgemm_dsolve_8x4 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_8x4_load ++ dgemm_8x4 ++ b .L_dsolve_8x4 ++.L_dsolve_8x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++ ++ /* Load C2 */ ++ xvld U4, C2, 0x00 ++ xvld U5, C2, 0x20 ++ ++ /* Load C3 */ ++ xvld U6, C3, 0x00 ++ xvld U7, C3, 0x20 ++/********* solver *********/ ++.L_dsolve_8x4: ++ PTR_ADDI A0, T1, -(8 * 8 * 8) ++ PTR_ADDI B0, T2, -(8 * 4 * 8) ++ ++ dsolve_8 4 ++.endm ++ ++.macro dgemm_dsolve_4x2 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_4x2_load ++ dgemm_4x2 ++ b .L_dsolve_4x2 ++.L_dsolve_4x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++.L_dsolve_4x2: ++ PTR_ADDI A0, T1, -(4 * 4 * 8) ++ PTR_ADDI B0, T2, -(4 * 2 * 8) ++ ++ dsolve_4 2 ++.endm ++ ++.macro dgemm_dsolve_2x2 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_2x2_load ++ dgemm_2x2 ++ b .L_dsolve_2x2 ++.L_dsolve_2x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++.L_dsolve_2x2: ++ PTR_ADDI A0, T1, -(2 * 2 * 8) ++ PTR_ADDI B0, T2, -(2 * 2 * 8) ++ ++ dsolve_2 2 ++.endm ++ ++.macro dgemm_dsolve_8x2 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_8x2_load ++ dgemm_8x2 ++ b .L_dsolve_8x2 ++.L_dsolve_8x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++.L_dsolve_8x2: ++ PTR_ADDI A0, T1, -(8 * 8 * 8) ++ PTR_ADDI B0, T2, -(8 * 2 * 8) ++ ++ dsolve_8 2 ++.endm ++ ++.macro dgemm_dsolve_16x2 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_16x2_load ++ dgemm_16x2 ++ b .L_dsolve_16x2 ++.L_dsolve_16x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++ /* Load C1 */ ++ xvld U4, C1, 0x00 ++ xvld U5, C1, 0x20 ++ xvld U6, C1, 0x40 ++ xvld U7, C1, 0x60 ++.L_dsolve_16x2: ++ PTR_ADDI A0, T1, -(16 * 8 * 8) ++ PTR_ADDI A0, A0, -(16 * 8 * 8) ++ PTR_ADDI B0, T2, -(16 * 2 * 8) ++ ++ 
dsolve_16 2 ++.endm ++ ++.macro dgemm_dsolve_2x1 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_2x1_load ++ dgemm_2x1 ++ b .L_dsolve_2x1 ++.L_dsolve_2x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++.L_dsolve_2x1: ++ PTR_ADDI A0, T1, -(2 * 2 * 8) ++ PTR_ADDI B0, T2, -(2 * 1 * 8) ++ ++ dsolve_2 1 ++.endm ++ ++.macro dgemm_dsolve_4x1 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_4x1_load ++ dgemm_4x1 ++ b .L_dsolve_4x1 ++.L_dsolve_4x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++.L_dsolve_4x1: ++ PTR_ADDI A0, T1, -(4 * 4 * 8) ++ PTR_ADDI B0, T2, -(4 * 1 * 8) ++ ++ dsolve_4 1 ++.endm ++ ++.macro dgemm_dsolve_8x1 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_8x1_load ++ dgemm_8x1 ++ b .L_dsolve_8x1 ++.L_dsolve_8x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++.L_dsolve_8x1: ++ PTR_ADDI A0, T1, -(8 * 8 * 8) ++ PTR_ADDI B0, T2, -(8 * 1 * 8) ++ ++ dsolve_8 1 ++.endm ++ ++.macro dgemm_dsolve_16x1 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_16x1_load ++ dgemm_16x1 ++ b .L_dsolve_16x1 ++.L_dsolve_16x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++.L_dsolve_16x1: ++ PTR_ADDI A0, T1, -(16 * 8 * 8) ++ PTR_ADDI A0, A0, -(16 * 8 * 8) ++ PTR_ADDI B0, T2, -(16 * 1 * 8) ++ ++ dsolve_16 1 ++.endm ++ ++ PROLOGUE ++ push_if_used 26, 32 ++ PTR_SLLI LDC, LDC, 3 ++ /* if (!(N >> 2)) goto L_N3 */ ++ PTR_SRAI J, N, 2 /* J = bn >> 2 */ ++ andi N, N, 0x03 ++ beq ZERO, J, .L_N3 ++.align 5 ++.L_J1: ++ PTR_ADDI J, J, -1 ++ PTR_ADD KK, M, OFFSET ++ ++ andi I, M, 15 ++ beq ZERO, I, .L_M16 ++ andi I, M, 1 ++ beqz I, .L_M2 ++.L_M1: ++ PTR_ADDI T0, M, -1 ++ PTR_SLLI T0, T0, 3 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */ ++ PTR_ADD CC, T0, C /* c + (m - 1) */ ++ ++ PTR_SLLI T0, KK, 5 ++ PTR_ADD B0, B, T0 /* b + 4 * kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ dgemm_dsolve_1x4 ++ PTR_ADDI KK, KK, -1 ++.L_M2: ++ andi I, M, 2 ++ beqz I, .L_M4 ++ PTR_SRLI T0, M, 1 ++ PTR_SLLI T0, T0, 1 ++ PTR_ADDI T0, T0, -2 ++ PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */ ++ PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/ ++ PTR_SLLI T1, KK, 4 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */ ++ PTR_SLLI T0, KK, 5 ++ PTR_ADD B0, B, T0 /* b + 4 * kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ dgemm_dsolve_2x4 ++ PTR_ADDI KK, KK, -2 ++.L_M4: ++ andi I, M, 4 ++ beqz I, .L_M8 ++ PTR_SRLI T0, M, 2 ++ PTR_SLLI T0, T0, 2 ++ PTR_ADDI T0, T0, -4 ++ PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */ ++ PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/ ++ PTR_SLLI T1, KK, 5 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */ ++ PTR_SLLI T0, KK, 5 ++ PTR_ADD B0, B, T0 /* b + 4 * kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ dgemm_dsolve_4x4 ++ PTR_ADDI KK, KK, -4 ++.L_M8: ++ andi I, M, 8 ++ beqz I, .L_M16 ++ PTR_SRLI T0, M, 3 ++ PTR_SLLI T0, T0, 3 ++ PTR_ADDI T0, T0, -8 ++ PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */ ++ PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/ ++ PTR_SLLI T1, KK, 6 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */ ++ PTR_SLLI T0, KK, 5 ++ PTR_ADD B0, B, T0 /* b + 4 * kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ dgemm_dsolve_8x4 ++ PTR_ADDI KK, 
KK, -8 ++.L_M16: ++ PTR_SRAI I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_M0 ++ ++ PTR_SRLI T0, M, 4 ++ PTR_SLLI T0, T0, 4 ++ PTR_ADDI T0, T0, -16 /* ((M & -16)) - 16) */ ++ PTR_SLLI T0, T0, 3 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, A, AA ++ PTR_ADD CC, C, T0 ++.align 5 ++.L_I1: ++ PTR_SLLI T0, KK, 5 ++ PTR_ADD B0, B, T0 ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ PTR_SLLI T0, KK, 7 ++ PTR_ADD A0, AA, T0 ++ dgemm_dsolve_16x4 ++ PTR_ADDI I, I, -1 ++ PTR_ADDI KK, KK, -16 ++ PTR_ADDI CC, CC, -(16 * 8) ++ PTR_SLLI T0, K, 7 ++ PTR_SUB AA, AA, T0 ++ blt ZERO, I, .L_I1 ++.L_M0: ++ PTR_SLLI T0, K, 3 ++ PTR_ALSL B, T0, B, 2 // b += 4 * k; ++ PTR_ALSL C, LDC, C, 2 // c += 4 * ldc ++ blt ZERO, J, .L_J1 ++.L_N3: ++ andi J, N, 2 ++ beq ZERO, J, .L_N1 ++ ++ PTR_ADD KK, M, OFFSET ++ andi I, M, 15 ++ beq ZERO, I, .L_N3_M16 ++ andi I, M, 1 ++ beqz I, .L_N3_M2 ++.L_N3_M1: ++ PTR_ADDI KK, KK, -1 ++ ++ PTR_ADDI T0, M, -1 ++ PTR_SLLI T0, T0, 3 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */ ++ PTR_ADD CC, T0, C /* c + (m - 1) */ ++ ++ PTR_SLLI T0, KK, 4 ++ PTR_ADD B0, B, T0 /* b + 2 * kk */ ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ // dgemm_dsolve_1x2 ++ GLD f, d, $f0, A0, 0, $f1, C0, 0, $f2, C1, 0 ++ GMUL f, d, $f1, $f1, $f0, $f2, $f2, $f0 ++ GST f, d, $f1, C0, 0, $f2, C1, 0, $f1, B0, 0, $f2, B0, 8 ++.L_N3_M2: ++ andi I, M, 2 ++ beqz I, .L_N3_M4 ++ PTR_SRLI T0, M, 1 ++ PTR_SLLI T0, T0, 1 ++ PTR_ADDI T0, T0, -2 ++ PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */ ++ PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/ ++ PTR_SLLI T1, KK, 4 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */ ++ PTR_SLLI T0, KK, 4 ++ PTR_ADD B0, B, T0 /* b + 2 * kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ dgemm_dsolve_2x2 ++ PTR_ADDI KK, KK, -2 ++.L_N3_M4: ++ andi I, M, 4 ++ beqz I, .L_N3_M8 ++ PTR_SRLI T0, M, 2 ++ PTR_SLLI T0, T0, 2 ++ PTR_ADDI T0, T0, -4 ++ PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */ ++ PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/ ++ PTR_SLLI T1, KK, 5 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */ ++ PTR_SLLI T0, KK, 4 ++ PTR_ADD B0, B, T0 /* b + 2 * kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ dgemm_dsolve_4x2 ++ PTR_ADDI KK, KK, -4 ++.L_N3_M8: ++ andi I, M, 8 ++ beqz I, .L_N3_M16 ++ PTR_SRLI T0, M, 3 ++ PTR_SLLI T0, T0, 3 ++ PTR_ADDI T0, T0, -8 ++ PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */ ++ PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/ ++ PTR_SLLI T1, KK, 6 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */ ++ PTR_SLLI T0, KK, 4 ++ PTR_ADD B0, B, T0 /* b + 2 * kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ dgemm_dsolve_8x2 ++ PTR_ADDI KK, KK, -8 ++.L_N3_M16: ++ PTR_SRAI I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_N3_M0 ++ ++ PTR_SRLI T0, M, 4 ++ PTR_SLLI T0, T0, 4 ++ PTR_ADDI T0, T0, -16 /* ((M & -16)) - 16) */ ++ PTR_SLLI T0, T0, 3 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, A, AA ++ PTR_ADD CC, C, T0 ++.align 5 ++.L_N3_I1: ++ PTR_SLLI T0, KK, 4 ++ PTR_ADD B0, B, T0 ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ PTR_SLLI T0, KK, 7 ++ PTR_ADD A0, AA, T0 ++ dgemm_dsolve_16x2 ++ PTR_ADDI I, I, -1 ++ PTR_ADDI KK, KK, -16 ++ PTR_ADDI CC, CC, -(16 * 8) ++ PTR_SLLI T0, K, 7 ++ PTR_SUB AA, AA, T0 ++ blt ZERO, I, .L_N3_I1 ++.L_N3_M0: ++ PTR_SLLI T0, K, 3 ++ PTR_ALSL B, T0, B, 1 // b += 2 * k; ++ PTR_ALSL C, LDC, C, 1 // c += 2 * ldc 
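++// Note: each column pass (4, then 2, then 1 columns of B/C) repeats the same
++// bookkeeping: KK is reset to M + OFFSET, the row tiles of C are walked from
++// the 1/2/4/8 remainders up to the 16-wide blocks, and KK shrinks by the tile
++// height from one tile to the next, so that B0 = B + ncol * KK * 8 and
++// L = K - KK always describe the part of the panel still needing the GEMM
++// update before the solve. Roughly, in C-like pseudocode (descriptive names
++// only):
++//   kk = m + offset;
++//   for (each row tile of this column pass) {
++//     b0 = b + ncol * kk * sizeof(double);
++//     l  = k - kk;                  // remaining GEMM depth
++//     dgemm_dsolve_<tile>x<ncol>();
++//     kk -= tile_height;
++//   }
++//   b += ncol * k; c += ncol * ldc; // as done just above for ncol = 2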
++.L_N1: ++ andi J, N, 1 ++ beq ZERO, J, .L_N0 ++ ++ PTR_ADD KK, M, OFFSET ++ andi I, M, 15 ++ beq ZERO, I, .L_N1_M16 ++ andi I, M, 1 ++ beqz I, .L_N1_M2 ++.L_N1_M1: ++ PTR_ADDI KK, KK, -1 ++ ++ PTR_ADDI T0, M, -1 ++ PTR_SLLI T0, T0, 3 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */ ++ PTR_ADD CC, T0, C /* c + (m - 1) */ ++ ++ PTR_SLLI T0, KK, 3 ++ PTR_ADD B0, B, T0 /* b + kk */ ++ GADD , d, C0, CC, ZERO ++ // dgemm_dsolve_1x1 ++ GLD f, d, $f0, A0, 0, $f1, C0, 0 ++ GMUL f, d, $f1, $f1, $f0 ++ GST f, d, $f1, C0, 0, $f1, B0, 0 ++.L_N1_M2: ++ andi I, M, 2 ++ beqz I, .L_N1_M4 ++ PTR_SRLI T0, M, 1 ++ PTR_SLLI T0, T0, 1 ++ PTR_ADDI T0, T0, -2 ++ PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */ ++ PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/ ++ PTR_SLLI T1, KK, 4 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */ ++ PTR_SLLI T0, KK, 3 ++ PTR_ADD B0, B, T0 /* b + kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO ++ dgemm_dsolve_2x1 ++ PTR_ADDI KK, KK, -2 ++.L_N1_M4: ++ andi I, M, 4 ++ beqz I, .L_N1_M8 ++ PTR_SRLI T0, M, 2 ++ PTR_SLLI T0, T0, 2 ++ PTR_ADDI T0, T0, -4 ++ PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */ ++ PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/ ++ PTR_SLLI T1, KK, 5 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */ ++ PTR_SLLI T0, KK, 3 ++ PTR_ADD B0, B, T0 /* b + kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO ++ dgemm_dsolve_4x1 ++ PTR_ADDI KK, KK, -4 ++.L_N1_M8: ++ andi I, M, 8 ++ beqz I, .L_N1_M16 ++ PTR_SRLI T0, M, 3 ++ PTR_SLLI T0, T0, 3 ++ PTR_ADDI T0, T0, -8 ++ PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */ ++ PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/ ++ PTR_SLLI T1, KK, 6 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, AA, A ++ PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */ ++ PTR_SLLI T0, KK, 3 ++ PTR_ADD B0, B, T0 /* b + kk */ ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO ++ dgemm_dsolve_8x1 ++ PTR_ADDI KK, KK, -8 ++.L_N1_M16: ++ PTR_SRAI I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_N1_M0 ++ ++ PTR_SRLI T0, M, 4 ++ PTR_SLLI T0, T0, 4 ++ PTR_ADDI T0, T0, -16 /* ((M & -16)) - 16) */ ++ PTR_SLLI T0, T0, 3 ++ PTR_MUL AA, T0, K ++ PTR_ADD AA, A, AA ++ PTR_ADD CC, C, T0 ++.align 5 ++.L_N1_I1: ++ PTR_SLLI T0, KK, 3 ++ PTR_ADD B0, B, T0 ++ PTR_SUB L, K, KK ++ GADD , d, C0, CC, ZERO ++ PTR_SLLI T0, KK, 7 ++ PTR_ADD A0, AA, T0 ++ dgemm_dsolve_16x1 ++ PTR_ADDI I, I, -1 ++ PTR_ADDI KK, KK, -16 ++ PTR_ADDI CC, CC, -(16 * 8) ++ PTR_SLLI T0, K, 7 ++ PTR_SUB AA, AA, T0 ++ blt ZERO, I, .L_N1_I1 ++.L_N1_M0: ++.L_N0: ++ pop_if_used 26, 32 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S +new file mode 100644 +index 000000000..0e2cacccf +--- /dev/null ++++ b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S +@@ -0,0 +1,959 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. 
Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/08/26 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++*********************************************************************/ ++ ++/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, ++ * FLOAT *c, BLASLONG ldc, BLASLONG offset) ++ */ ++ ++#define M $r4 // param 1: bm ++#define N $r5 // param 2: bn ++#define K $r6 // param 3: bk ++#define A $r7 // param 5: ba ++#define B $r8 // param 6: bb ++#define C $r9 // param 7: bc ++#define LDC $r10 // param 8: ldc ++#define OFFSET $r11 // param 9: offset ++ ++/* Cycle control parameters */ ++#define I $r13 ++#define J $r14 ++#define L $r15 ++#define TL $r16 ++/* Matrix address */ ++#define A0 $r17 ++#define B0 $r18 ++#define C0 $r19 ++#define C1 $r20 ++#define C2 $r23 ++#define C3 $r24 ++#define T0 $r25 ++#define T1 $r26 ++#define T2 $r27 ++#define KK $r28 ++#define AA $r29 ++#define CC $r30 ++#define BB B0 ++#undef ZERO ++#define ZERO $r0 ++ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++#define U8 $xr8 ++#define U9 $xr9 ++#define U10 $xr10 ++#define U11 $xr11 ++#define U12 $xr12 ++#define U13 $xr13 ++#define U14 $xr14 ++#define U15 $xr15 ++#define D0 $xr16 ++#define D1 $xr17 ++#define D2 $xr18 ++#define D3 $xr19 ++#define D4 $xr20 ++#define D5 $xr21 ++#define D6 $xr22 ++#define D7 $xr23 ++#define D8 $xr24 ++#define D9 $xr25 ++#define D10 $xr26 ++#define D11 $xr27 ++#define D12 $xr28 ++#define D13 $xr29 ++#define D14 $xr30 ++#define D15 $xr31 ++#define G0 D0 ++#define G1 D1 ++#define G2 D2 ++#define G3 D3 ++#define G4 D4 ++#define G5 D5 ++#define G6 D6 ++#define G7 D7 ++#define G8 D8 ++#define G9 D9 ++#define G10 D10 ++#define G11 D11 ++#define G12 D12 ++#define G13 D13 ++#define G14 D14 ++#define G15 D15 ++ ++/* Prefetch interval */ ++#define A_PRE 0x400 ++#define B_PRE 0x100 ++ ++#include "dtrsm_kernel_macro.S" ++ ++.macro ldrepl_macro start, end, stride ++// Load Ux (x = 0...15) ++.if \start <= \end ++ GLDREPL xv, d, $xr\start, A0, \stride * 8 ++ ldrepl_macro %start + 1, \end, %stride + 1 ++.endif ++.endm ++.macro nmsub_macro start0, end0, start1, reg ++// Gx -= reg * Ux ++.if \start0 <= \end0 ++ xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 ++ nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg ++.endif ++.endm 
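++// For reference, these recursive macros simply unroll over a register range,
++// advancing the offset/source index by one per step. A hypothetical call
++// "ldrepl_macro 2, 4, 0" expands to
++//   GLDREPL xv, d, $xr2, A0, 0 * 8
++//   GLDREPL xv, d, $xr3, A0, 1 * 8
++//   GLDREPL xv, d, $xr4, A0, 2 * 8
++// and "nmsub_macro 17, 19, 1, G0" expands to
++//   xvfnmsub.d $xr17, G0, $xr1, $xr17
++//   xvfnmsub.d $xr18, G0, $xr2, $xr18
++//   xvfnmsub.d $xr19, G0, $xr3, $xr19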
++.macro B_st_macro start, end, stride, N ++// Store Gx(x = 16...31) ++.if \start <= \end ++.if \N == 4 ++ xvst $xr\start, B0, \stride * 0x20 ++.elseif \N == 2 ++ vst $vr\start, B0, \stride * 0x10 ++.elseif \N == 1 ++ fst.d $f\start, B0, \stride * 0x08 ++.endif ++ B_st_macro %start + 1, \end, %stride + 1, \N ++.endif ++.endm ++ ++.macro dsolve_16 N ++// The data layout of C (4x16) is as follows (store 4 data in each register): ++// U0 U1 U2 U3 ++// U4 U5 U6 U7 ++// U8 U9 U10 U11 ++// U12 U13 U14 U15 ++// The first step is to transpose the result of C ++ GTRANSPOSE4x4_D U3, U7, U11, U15, G12, G13, G14, G15, D0, D1 ++ GTRANSPOSE4x4_D U2, U6, U10, U14, G8, G9, G10, G11, D0, D1 ++ GTRANSPOSE4x4_D U1, U5, U9, U13, G4, G5, G6, G7, U3, U7 ++ GTRANSPOSE4x4_D U0, U4, U8, U12, G0, G1, G2, G3, U3, U7 ++// Now we have the following memory layout of C: ++// 0 1 2 3 ... 15 ++// 0 | | | | | | | ++// 1 | G0 | G1 | G2 | G3 | ... | G15 | ++// 2 | | | | | | | ++// 3 | | | | | | | ++// Next we are going to process matrix A with a size of 16x16, ++// using only the upper triangular portion. The memory layout of ++// matrix A is as follows, quite large. ++//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ++// 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 ++// 34 35 36 37 38 39 40 41 42 43 44 45 46 47 ++// 51 52 53 54 55 56 57 58 59 60 61 62 63 ++// 68 69 70 71 72 73 74 75 76 77 78 79 ++// 85 86 87 88 89 90 91 92 93 94 95 ++// 102 103 104 105 106 107 108 109 110 111 ++// 119 120 121 122 123 124 125 126 127 ++// 136 137 138 139 140 141 142 143 ++// 153 154 155 156 157 158 159 ++// 170 171 172 173 174 175 ++// 187 188 189 190 191 ++// 204 205 206 207 ++// 221 222 223 ++// 238 239 ++// 255 ++// Sequentially extract data from A in row order ++// Load 0 ++ ldrepl_macro 0, 15, 0 ++ GMUL xvf, d, G0, G0, U0 ++ nmsub_macro 17, 31, 1, G0 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 1 ++ ldrepl_macro 1, 15, 0 ++ GMUL xvf, d, G1, G1, U1 ++ nmsub_macro 18, 31, 2, G1 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 2 ++ ldrepl_macro 2, 15, 0 ++ GMUL xvf, d, G2, G2, U2 ++ nmsub_macro 19, 31, 3, G2 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 3 ++ ldrepl_macro 3, 15, 0 ++ GMUL xvf, d, G3, G3, U3 ++ nmsub_macro 20, 31, 4, G3 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 4 ++ ldrepl_macro 4, 15, 0 ++ GMUL xvf, d, G4, G4, U4 ++ nmsub_macro 21, 31, 5, G4 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 5 ++ ldrepl_macro 5, 15, 0 ++ GMUL xvf, d, G5, G5, U5 ++ nmsub_macro 22, 31, 6, G5 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 6 ++ ldrepl_macro 6, 15, 0 ++ GMUL xvf, d, G6, G6, U6 ++ nmsub_macro 23, 31, 7, G6 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 7 ++ ldrepl_macro 7, 15, 0 ++ GMUL xvf, d, G7, G7, U7 ++ nmsub_macro 24, 31, 8, G7 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 8 ++ ldrepl_macro 8, 15, 0 ++ GMUL xvf, d, G8, G8, U8 ++ nmsub_macro 25, 31, 9, G8 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 9 ++ ldrepl_macro 9, 15, 0 ++ GMUL xvf, d, G9, G9, U9 ++ nmsub_macro 26, 31, 10, G9 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 10 ++ ldrepl_macro 10, 15, 0 ++ GMUL xvf, d, G10, G10, U10 ++ nmsub_macro 27, 31, 11, G10 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 11 ++ ldrepl_macro 11, 15, 0 ++ GMUL xvf, d, G11, G11, U11 ++ nmsub_macro 28, 31, 12, G11 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 12 ++ ldrepl_macro 12, 15, 0 ++ GMUL xvf, d, G12, G12, U12 ++ nmsub_macro 29, 31, 13, G12 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 13 ++ ldrepl_macro 13, 15, 0 ++ GMUL xvf, d, G13, G13, U13 ++ nmsub_macro 30, 31, 14, G13 ++ PTR_ADDI A0, A0, 17 * 8 ++// Load 14 ++ ldrepl_macro 14, 15, 0 ++ GMUL xvf, d, G14, G14, U14 ++ nmsub_macro 31, 31, 15, G14 ++ PTR_ADDI 
A0, A0, 17 * 8 ++// Load 15 ++ ldrepl_macro 15, 15, 0 ++ GMUL xvf, d, G15, G15, U15 ++// Finally, We can store the result. ++// For B, stored sequentially, and C, first transpose and then store ++ B_st_macro 16, 31, 0, \N ++ GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 ++ GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1 ++ GTRANSPOSE4x4_D G8, G9, G10, G11, G8, G9, G10, G11, U0, U1 ++ GTRANSPOSE4x4_D G12, G13, G14, G15, G12, G13, G14, G15, U0, U1 ++.if \N == 4 ++ GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60, \ ++ G1, C1, 0x00, G5, C1, 0x20, G9, C1, 0x40, G13, C1, 0x60, \ ++ G2, C2, 0x00, G6, C2, 0x20, G10, C2, 0x40, G14, C2, 0x60, \ ++ G3, C3, 0x00, G7, C3, 0x20, G11, C3, 0x40, G15, C3, 0x60 ++.elseif \N == 2 ++ GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60, \ ++ G1, C1, 0x00, G5, C1, 0x20, G9, C1, 0x40, G13, C1, 0x60 ++.elseif \N == 1 ++ GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60 ++.endif ++.endm ++ ++.macro dgemm_dsolve_16x4 ++ bge ZERO, KK, .L_dsolve_16x4_load ++ dgemm_16x4 ++ b .L_dsolve_16x4 ++.L_dsolve_16x4_load: ++ // Load C ++ GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 ++ GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 ++ GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 ++ GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 ++/********************** solver ******************/ ++.L_dsolve_16x4: ++ dsolve_16 4 ++.endm ++ ++.macro dsolve_8 N ++// The data layout of C (4x8) is as follows (store 4 data in each register): ++// U0 U1 ++// U2 U3 ++// U4 U5 ++// U6 U7 ++// The first step is to transpose the result of C ++ GTRANSPOSE4x4_D U1, U3, U5, U7, G4, G5, G6, G7, G8, G9 ++ GTRANSPOSE4x4_D U0, U2, U4, U6, G0, G1, G2, G3, G8, G9 ++// Now we have the following memory layout of C: ++// 0 1 2 3 ... 7 ++// 0 | | | | | | | ++// 1 | G0 | G1 | G2 | G3 | ... | G7 | ++// 2 | | | | | | | ++// 3 | | | | | | | ++// Next we are going to process matrix A with a size of 8x8, ++// using only the upper triangular portion. The memory layout of ++// matrix A is as follows: ++//0 1 2 3 4 5 6 7 ++// 9 10 11 12 13 14 15 ++// 18 19 20 21 22 23 ++// 27 28 29 30 31 ++// 36 37 38 39 ++// 45 46 47 ++// 54 55 ++// 63 ++// Sequentially extract data from A in row order ++// Load 0 ++ ldrepl_macro 0, 7, 0 ++ GMUL xvf, d, G0, G0, U0 ++ nmsub_macro 17, 23, 1, G0 ++ PTR_ADDI A0, A0, 9 * 8 ++// Load 1 ++ ldrepl_macro 1, 7, 0 ++ GMUL xvf, d, G1, G1, U1 ++ nmsub_macro 18, 23, 2, G1 ++ PTR_ADDI A0, A0, 9 * 8 ++// Load 2 ++ ldrepl_macro 2, 7, 0 ++ GMUL xvf, d, G2, G2, U2 ++ nmsub_macro 19, 23, 3, G2 ++ PTR_ADDI A0, A0, 9 * 8 ++// Load 3 ++ ldrepl_macro 3, 7, 0 ++ GMUL xvf, d, G3, G3, U3 ++ nmsub_macro 20, 23, 4, G3 ++ PTR_ADDI A0, A0, 9 * 8 ++// Load 4 ++ ldrepl_macro 4, 7, 0 ++ GMUL xvf, d, G4, G4, U4 ++ nmsub_macro 21, 23, 5, G4 ++ PTR_ADDI A0, A0, 9 * 8 ++// Load 5 ++ ldrepl_macro 5, 7, 0 ++ GMUL xvf, d, G5, G5, U5 ++ nmsub_macro 22, 23, 6, G5 ++ PTR_ADDI A0, A0, 9 * 8 ++// Load 6 ++ ldrepl_macro 6, 7, 0 ++ GMUL xvf, d, G6, G6, U6 ++ nmsub_macro 23, 23, 7, G6 ++ PTR_ADDI A0, A0, 9 * 8 ++// Load 7 ++ ldrepl_macro 7, 7, 0 ++ GMUL xvf, d, G7, G7, U7 ++// Finally, We can store the result. 
++// For B, stored sequentially, and C, first transpose and then store ++ B_st_macro 16, 23, 0, \N ++ GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 ++ GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1 ++.if \N == 4 ++ GST xv, , G0, C0, 0x00, G4, C0, 0x20, \ ++ G1, C1, 0x00, G5, C1, 0x20, \ ++ G2, C2, 0x00, G6, C2, 0x20, \ ++ G3, C3, 0x00, G7, C3, 0x20 ++.elseif \N == 2 ++ GST xv, , G0, C0, 0x00, G4, C0, 0x20, \ ++ G1, C1, 0x00, G5, C1, 0x20 ++.elseif \N == 1 ++ GST xv, , G0, C0, 0x00, G4, C0, 0x20 ++.endif ++.endm ++ ++.macro dgemm_dsolve_8x4 ++ bge ZERO, L, .L_dsolve_8x4_load ++ dgemm_8x4 ++ b .L_dsolve_8x4 ++.L_dsolve_8x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++ ++ /* Load C2 */ ++ xvld U4, C2, 0x00 ++ xvld U5, C2, 0x20 ++ ++ /* Load C3 */ ++ xvld U6, C3, 0x00 ++ xvld U7, C3, 0x20 ++/********* solver *********/ ++.L_dsolve_8x4: ++ dsolve_8 4 ++.endm ++ ++.macro dsolve_4 N ++// The data layout of C (4x4) is as follows (store 4 data in each register): ++// U0 ++// U1 ++// U2 ++// U3 ++// The first step is to transpose the result of C ++ GTRANSPOSE4x4_D U0, U1, U2, U3, G0, G1, G2, G3, G4, G5 ++// Now we have the following memory layout of C: ++// 0 1 2 3 ++// 0 | | | | | ++// 1 | G0 | G1 | G2 | G3 | ++// 2 | | | | | ++// 3 | | | | | ++// Next we are going to process matrix A with a size of 4x4, ++// using only the upper triangular portion. The memory layout of ++// matrix A is as follows: ++//0 1 2 3 ++// 5 6 7 ++// 10 11 ++// 15 ++// Sequentially extract data from A in row order ++// Load 0 ++ ldrepl_macro 0, 3, 0 ++ GMUL xvf, d, G0, G0, U0 ++ nmsub_macro 17, 19, 1, G0 ++ PTR_ADDI A0, A0, 5 * 8 ++// Load 1 ++ ldrepl_macro 1, 3, 0 ++ GMUL xvf, d, G1, G1, U1 ++ nmsub_macro 18, 19, 2, G1 ++ PTR_ADDI A0, A0, 5 * 8 ++// Load 2 ++ ldrepl_macro 2, 3, 0 ++ GMUL xvf, d, G2, G2, U2 ++ nmsub_macro 19, 19, 3, G2 ++ PTR_ADDI A0, A0, 5 * 8 ++// Load 3 ++ ldrepl_macro 3, 3, 0 ++ GMUL xvf, d, G3, G3, U3 ++// Finally, We can store the result. ++// For B, stored sequentially, and C, first transpose and then store ++ B_st_macro 16, 19, 0, \N ++ GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 ++.if \N == 4 ++ GST xv, , G0, C0, 0x00, G1, C1, 0x00, G2, C2, 0x00, G3, C3, 0x00 ++.elseif \N == 2 ++ GST xv, , G0, C0, 0x00, G1, C1, 0x00 ++.elseif \N == 1 ++ GST xv, , G0, C0, 0x00 ++.endif ++.endm ++ ++.macro dgemm_dsolve_4x4 ++ bge ZERO, L, .L_dsolve_4x4_load ++ dgemm_4x4 ++ b .L_dsolve_4x4 ++.L_dsolve_4x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++/************** solver *****************/ ++.L_dsolve_4x4: ++ dsolve_4 4 ++.endm ++ ++.macro dsolve_2 N ++// Transpose ++ GSBUTTERFLY xv, d, G0, G1, U1, U0 ++// Now we have the following memory layout of C: ++// 0 1 ++// 0 | | | ++// 1 | G0 | G1 | ++// 2 | | | ++// 3 | | | ++// Next we are going to process matrix A with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix A is as follows: ++//0 1 ++// 3 ++// Sequentially extract data from A in row order ++// Load 0 ++ ldrepl_macro 0, 1, 0 ++ GMUL xvf, d, G0, G0, U0 ++ nmsub_macro 17, 17, 1, G0 ++ PTR_ADDI A0, A0, 3 * 8 ++// Load 1 ++ ldrepl_macro 1, 1, 0 ++ GMUL xvf, d, G1, G1, U1 ++// Finally, We can store the result. 
++// For B, stored sequentially, and C, first transpose and then store ++ B_st_macro 16, 17, 0, \N ++ GSBUTTERFLY xv, d, U0, U1, G1, G0 ++.if \N == 4 ++ vst $vr0, C0, 0x00 ++ vst $vr1, C1, 0x00 ++ xvstelm.d U0, C2, 0x00, 0x02 ++ xvstelm.d U1, C3, 0x00, 0x02 ++ xvstelm.d U0, C2, 0x08, 0x03 ++ xvstelm.d U1, C3, 0x08, 0x03 ++.elseif \N == 2 ++ vst $vr0, C0, 0x00 ++ vst $vr1, C1, 0x00 ++.elseif \N == 1 ++ vst $vr0, C0, 0x00 ++.endif ++.endm ++ ++.macro dgemm_dsolve_2x4 ++ bge ZERO, L, .L_dsolve_2x4_load ++ dgemm_2x4 ++ b .L_dsolve_2x4 ++.L_dsolve_2x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++ ++ xvpermi.q U0, U2, 0x02 ++ xvpermi.q U1, U3, 0x02 ++/********************** solver ******************/ ++.L_dsolve_2x4: ++ dsolve_2 4 ++.endm ++ ++.macro dgemm_dsolve_1x4 ++ bge ZERO, L, .L_dsolve_1x4_load ++ dgemm_1x4 ++ b .L_dsolve_1x4 ++.L_dsolve_1x4_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++ fld.d $f1, C1, 0x00 ++ fld.d $f2, C2, 0x00 ++ fld.d $f3, C3, 0x00 ++ xvinsve0.d U0, U1, 0x01 ++ xvinsve0.d U0, U2, 0x02 ++ xvinsve0.d U0, U3, 0x03 ++.L_dsolve_1x4: ++ GLDREPL xv, d, D0, A0, 0x00 ++ GMUL xvf, d, U0, U0, D0 ++ // Store C ++ xvstelm.d U0, C0, 0x00, 0x00 ++ xvstelm.d U0, C1, 0x00, 0x01 ++ xvstelm.d U0, C2, 0x00, 0x02 ++ xvstelm.d U0, C3, 0x00, 0x03 ++ // Store B ++ xvst U0, B0, 0x00 ++.endm ++ ++.macro dgemm_dsolve_16x2 ++ bge ZERO, L, .L_dsolve_16x2_load ++ dgemm_16x2 ++ b .L_dsolve_16x2 ++.L_dsolve_16x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++ /* Load C1 */ ++ xvld U4, C1, 0x00 ++ xvld U5, C1, 0x20 ++ xvld U6, C1, 0x40 ++ xvld U7, C1, 0x60 ++.L_dsolve_16x2: ++ dsolve_16 2 ++.endm ++ ++.macro dgemm_dsolve_8x2 ++ bge ZERO, L, .L_dsolve_8x2_load ++ dgemm_8x2 ++ b .L_dsolve_8x2 ++.L_dsolve_8x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++.L_dsolve_8x2: ++ dsolve_8 2 ++.endm ++ ++.macro dgemm_dsolve_4x2 ++ bge ZERO, L, .L_dsolve_4x2_load ++ dgemm_4x2 ++ b .L_dsolve_4x2 ++.L_dsolve_4x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++.L_dsolve_4x2: ++ dsolve_4 2 ++.endm ++ ++.macro dgemm_dsolve_1x2 ++ bge ZERO, L, .L_dsolve_1x2_load ++ dgemm_1x2 ++ b .L_dsolve_1x2 ++.L_dsolve_1x2_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++ fld.d $f1, C1, 0x00 ++ xvinsve0.d U0, U1, 0x01 ++.L_dsolve_1x2: ++ GLDREPL xv, d, D0, A0, 0x00 ++ GMUL xvf, d, U0, U0, D0 ++ // Store C ++ xvstelm.d U0, C0, 0x00, 0x00 ++ xvstelm.d U0, C1, 0x00, 0x01 ++ // Store B ++ vst $vr0, B0, 0x00 ++.endm ++ ++.macro dgemm_dsolve_2x2 ++ bge ZERO, L, .L_dsolve_2x2_load ++ dgemm_2x2 ++ b .L_dsolve_2x2 ++.L_dsolve_2x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++.L_dsolve_2x2: ++ dsolve_2 2 ++.endm ++ ++.macro dgemm_dsolve_16x1 ++ bge ZERO, L, .L_dsolve_16x1_load ++ dgemm_16x1 ++ b .L_dsolve_16x1 ++.L_dsolve_16x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++.L_dsolve_16x1: ++ dsolve_16 1 ++.endm ++ ++.macro dgemm_dsolve_8x1 ++ bge ZERO, L, .L_dsolve_8x1_load ++ dgemm_8x1 ++ b .L_dsolve_8x1 ++.L_dsolve_8x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++.L_dsolve_8x1: ++ dsolve_8 1 ++.endm ++ ++.macro dgemm_dsolve_4x1 ++ bge ZERO, L, .L_dsolve_4x1_load ++ dgemm_4x1 ++ b .L_dsolve_4x1 ++.L_dsolve_4x1_load: ++ /* Load C0 */ ++ xvld 
U0, C0, 0x00 ++.L_dsolve_4x1: ++ dsolve_4 1 ++.endm ++ ++.macro dgemm_dsolve_2x1 ++ bge ZERO, L, .L_dsolve_2x1_load ++ dgemm_2x1 ++ b .L_dsolve_2x1 ++.L_dsolve_2x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++.L_dsolve_2x1: ++ dsolve_2 1 ++.endm ++ ++.macro dgemm_dsolve_1x1 ++ bge ZERO, L, .L_dsolve_1x1_load ++ dgemm_1x1 ++ b .L_dsolve_1x1 ++.L_dsolve_1x1_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++.L_dsolve_1x1: ++ GLDREPL xv, d, D0, A0, 0x00 ++ GMUL xvf, d, U0, U0, D0 ++ // Store C ++ xvstelm.d U0, C0, 0x00, 0x00 ++ // Store B ++ xvstelm.d U0, B0, 0x00, 0x00 ++.endm ++ ++ PROLOGUE ++ push_if_used 26, 32 ++ PTR_SLLI LDC, LDC, 3 ++ /* if (!(N >> 2)) goto L_N3 */ ++ PTR_SRAI J, N, 2 /* J = bn >> 2 */ ++ andi N, N, 0x03 ++ beq ZERO, J, .L_N3 ++.align 5 ++.L_J1: ++ PTR_ADDI J, J, -1 ++ move KK, OFFSET ++ move AA, A ++ move CC, C ++ PTR_SRAI I, M, 4 // M >> 4 ++ beqz I, .L_M15 ++.align 4 ++.L_I1: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_16x4 ++ PTR_ADDI I, I, -1 ++ PTR_SLLI T0, K, 7 ++ PTR_ADDI CC, CC, 0x80 // cc += 16 ++ PTR_ADDI KK, KK, 0x10 // kk += 16 ++ PTR_ADD AA, AA, T0 // aa += 16 * k ++ bnez I, .L_I1 ++.L_M15: ++ andi I, M, 8 ++ beqz I, .L_M7 ++.L_M8: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_8x4 ++ PTR_SLLI T0, K, 6 ++ PTR_ADDI CC, CC, 0x40 // cc += 8 ++ PTR_ADDI KK, KK, 0x08 // kk += 8 ++ PTR_ADD AA, AA, T0 // aa += 8 * k ++.L_M7: ++ andi I, M, 4 ++ beqz I, .L_M3 ++.L_M4: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_4x4 ++ PTR_SLLI T0, K, 5 ++ PTR_ADDI CC, CC, 0x20 // cc += 4 ++ PTR_ADDI KK, KK, 0x04 // kk += 4 ++ PTR_ADD AA, AA, T0 // aa += 4 * k ++.L_M3: ++ andi I, M, 2 ++ beqz I, .L_M1 ++.L_M2: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_2x4 ++ PTR_SLLI T0, K, 4 ++ PTR_ADDI CC, CC, 0x10 // cc += 2 ++ PTR_ADDI KK, KK, 0x02 // kk += 2 ++ PTR_ADD AA, AA, T0 // aa += 2 * k ++.L_M1: ++ andi I, M, 1 ++ beqz I, .L_M0 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_1x4 ++ PTR_SLLI T0, K, 3 ++ PTR_ADDI CC, CC, 0x08 // cc += 1 ++ PTR_ADDI KK, KK, 0x01 // kk += 1 ++ PTR_ADD AA, AA, T0 // aa += 1 * k ++.L_M0: ++ PTR_SLLI T0, K, 5 ++ PTR_SLLI T1, LDC, 2 ++ PTR_ADD B, B, T0 // b += 4 * k ++ PTR_ADD C, C, T1 // c += 4 * ldc ++ bnez J, .L_J1 ++.L_N3: ++ andi J, N, 2 ++ beq ZERO, J, .L_N1 ++.L_N2: ++ move KK, OFFSET ++ move AA, A ++ move CC, C ++ PTR_SRAI I, M, 4 // M >> 4 ++ beqz I, .L_N2_M15 ++.align 4 ++.L_N2_I1: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_16x2 ++ PTR_ADDI I, I, -1 ++ PTR_SLLI T0, K, 7 ++ PTR_ADDI CC, CC, 0x80 // cc += 16 ++ PTR_ADDI KK, KK, 0x10 // kk += 16 ++ PTR_ADD AA, AA, T0 // aa += 16 * k ++ bnez I, .L_N2_I1 ++.L_N2_M15: ++ andi I, M, 8 ++ beqz I, .L_N2_M7 ++.L_N2_M8: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_8x2 ++ PTR_SLLI T0, K, 6 ++ PTR_ADDI CC, CC, 0x40 // cc += 8 ++ PTR_ADDI KK, KK, 0x08 // kk += 8 ++ PTR_ADD AA, AA, T0 // aa += 8 * k ++.L_N2_M7: ++ andi I, M, 4 ++ beqz I, .L_N2_M3 ++.L_N2_M4: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_4x2 ++ PTR_SLLI T0, K, 5 ++ PTR_ADDI CC, CC, 0x20 // cc += 4 ++ PTR_ADDI 
KK, KK, 0x04 // kk += 4 ++ PTR_ADD AA, AA, T0 // aa += 4 * k ++.L_N2_M3: ++ andi I, M, 2 ++ beqz I, .L_N2_M1 ++.L_N2_M2: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_2x2 ++ PTR_SLLI T0, K, 4 ++ PTR_ADDI CC, CC, 0x10 // cc += 2 ++ PTR_ADDI KK, KK, 0x02 // kk += 2 ++ PTR_ADD AA, AA, T0 // aa += 2 * k ++.L_N2_M1: ++ andi I, M, 1 ++ beqz I, .L_N2_M0 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_1x2 ++ PTR_SLLI T0, K, 3 ++ PTR_ADDI CC, CC, 0x08 // cc += 1 ++ PTR_ADDI KK, KK, 0x01 // kk += 1 ++ PTR_ADD AA, AA, T0 // aa += 1 * k ++.L_N2_M0: ++ PTR_SLLI T0, K, 4 ++ PTR_SLLI T1, LDC, 1 ++ PTR_ADD B, B, T0 // b += 2 * k ++ PTR_ADD C, C, T1 // c += 2 * ldc ++.L_N1: ++ andi J, N, 1 ++ beq ZERO, J, .L_N0 ++ ++ move KK, OFFSET ++ move AA, A ++ move CC, C ++ PTR_SRAI I, M, 4 // M >> 4 ++ beqz I, .L_N1_M15 ++.align 4 ++.L_N1_I1: ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_16x1 ++ PTR_ADDI I, I, -1 ++ PTR_SLLI T0, K, 7 ++ PTR_ADDI CC, CC, 0x80 // cc += 16 ++ PTR_ADDI KK, KK, 0x10 // kk += 16 ++ PTR_ADD AA, AA, T0 // aa += 16 * k ++ bnez I, .L_N1_I1 ++.L_N1_M15: ++ andi I, M, 8 ++ beqz I, .L_N1_M7 ++.L_N1_M8: ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_8x1 ++ PTR_SLLI T0, K, 6 ++ PTR_ADDI CC, CC, 0x40 // cc += 8 ++ PTR_ADDI KK, KK, 0x08 // kk += 8 ++ PTR_ADD AA, AA, T0 // aa += 8 * k ++.L_N1_M7: ++ andi I, M, 4 ++ beqz I, .L_N1_M3 ++.L_N1_M4: ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_4x1 ++ PTR_SLLI T0, K, 5 ++ PTR_ADDI CC, CC, 0x20 // cc += 4 ++ PTR_ADDI KK, KK, 0x04 // kk += 4 ++ PTR_ADD AA, AA, T0 // aa += 4 * k ++.L_N1_M3: ++ andi I, M, 2 ++ beqz I, .L_N1_M1 ++.L_N1_M2: ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_2x1 ++ PTR_SLLI T0, K, 4 ++ PTR_ADDI CC, CC, 0x10 // cc += 2 ++ PTR_ADDI KK, KK, 0x02 // kk += 2 ++ PTR_ADD AA, AA, T0 // aa += 2 * k ++.L_N1_M1: ++ andi I, M, 1 ++ beqz I, .L_N1_M0 ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_1x1 ++ PTR_SLLI T0, K, 3 ++ PTR_ADDI CC, CC, 0x08 // cc += 1 ++ PTR_ADDI KK, KK, 0x01 // kk += 1 ++ PTR_ADD AA, AA, T0 // aa += 1 * k ++.L_N1_M0: ++.L_N0: ++ pop_if_used 26, 32 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S +new file mode 100644 +index 000000000..421339736 +--- /dev/null ++++ b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S +@@ -0,0 +1,882 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. 
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/09/26 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++*********************************************************************/ ++ ++/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, ++ * FLOAT *c, BLASLONG ldc, BLASLONG offset) ++ */ ++ ++#define M $r4 // param 1: bm ++#define N $r5 // param 2: bn ++#define K $r6 // param 3: bk ++#define A $r7 // param 5: ba ++#define B $r8 // param 6: bb ++#define C $r9 // param 7: bc ++#define LDC $r10 // param 8: ldc ++#define OFFSET $r11 // param 9: offset ++ ++/* Cycle control parameters */ ++#define I $r13 ++#define J $r14 ++#define L $r15 ++#define TL $r16 ++/* Matrix address */ ++#define A0 $r17 ++#define B0 $r18 ++#define C0 $r19 ++#define C1 $r20 ++#define C2 $r23 ++#define C3 $r24 ++#define T0 $r25 ++#define T1 $r26 ++#define T2 $r27 ++#define KK $r28 ++#define AA $r29 ++#define CC $r30 ++#define BB B0 ++#undef ZERO ++#define ZERO $r0 ++ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++#define U8 $xr8 ++#define U9 $xr9 ++#define U10 $xr10 ++#define U11 $xr11 ++#define U12 $xr12 ++#define U13 $xr13 ++#define U14 $xr14 ++#define U15 $xr15 ++#define D0 $xr16 ++#define D1 $xr17 ++#define D2 $xr18 ++#define D3 $xr19 ++#define D4 $xr20 ++#define D5 $xr21 ++#define D6 $xr22 ++#define D7 $xr23 ++#define D8 $xr24 ++#define D9 $xr25 ++#define D10 $xr26 ++#define D11 $xr27 ++#define D12 $xr28 ++#define D13 $xr29 ++#define D14 $xr30 ++#define D15 $xr31 ++#define G0 D0 ++#define G1 D1 ++#define G2 D2 ++#define G3 D3 ++#define G4 D4 ++#define G5 D5 ++#define G6 D6 ++#define G7 D7 ++#define G8 D8 ++#define G9 D9 ++#define G10 D10 ++#define G11 D11 ++#define G12 D12 ++#define G13 D13 ++#define G14 D14 ++#define G15 D15 ++ ++/* Prefetch interval */ ++#define A_PRE 0x400 ++#define B_PRE 0x100 ++ ++#include "dtrsm_kernel_macro.S" ++ ++.macro ldrepl_macro start, end, stride ++// Load Ux (x = 0...15) ++.if \start <= \end ++ GLDREPL xv, d, $xr\start, B0, \stride * 8 ++ ldrepl_macro %start + 1, \end, %stride + 1 ++.endif ++.endm ++ ++.macro nmsub_macro start0, end0, start1, reg ++// Ux -= reg * Dx ++.if \start0 <= \end0 ++ xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 ++ nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg ++.endif ++.endm ++ ++.macro A_st_macro start, end, stride, N ++// Store Ux(x = 0...15) ++.if \start <= \end ++.if \N == 4 ++ xvst $xr\start, A0, \stride * 0x20 ++.elseif \N == 2 ++ vst $vr\start, A0, 
\stride * 0x10 ++.elseif \N == 1 ++ fst.d $f\start, A0, \stride * 0x08 ++.endif ++ A_st_macro %start + 1, \end, %stride + 1, \N ++.endif ++.endm ++ ++.macro dsolve_16x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 1 2 3 ++// 5 6 7 ++// 10 11 ++// 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 19, 0 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 ++ ldrepl_macro 20, 22, 5 ++ nmsub_macro 4, 7, 0, D1 ++ ldrepl_macro 23, 24, 10 ++ GMUL xvf, d, U4, D4, U4, U5, D4, U5, U6, D4, U6, U7, D4, U7 ++ ldrepl_macro 25, 25, 15 ++ nmsub_macro 8, 11, 0, D2 ++ nmsub_macro 8, 11, 4, D5 ++ GMUL xvf, d, U8, D7, U8, U9, D7, U9, U10, D7, U10, U11, D7, U11 ++ nmsub_macro 12, 15, 0, D3 ++ nmsub_macro 12, 15, 4, D6 ++ nmsub_macro 12, 15, 8, D8 ++ GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 ++// Store A ++ A_st_macro 0, 15, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ ++ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ ++ U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \ ++ U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 ++.endm ++ ++.macro dsolve_16x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 1 ++// 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 17, 0 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 ++ ldrepl_macro 18, 18, 3 ++ nmsub_macro 4, 7, 0, D1 ++ GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 ++// Store A ++ A_st_macro 0, 7, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ ++ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 ++.endm ++ ++.macro dsolve_8x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 1 2 3 ++// 5 6 7 ++// 10 11 ++// 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 19, 0 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1 ++ ldrepl_macro 20, 22, 5 ++ nmsub_macro 2, 3, 0, D1 ++ ldrepl_macro 23, 24, 10 ++ GMUL xvf, d, U2, D4, U2, U3, D4, U3 ++ ldrepl_macro 25, 25, 15 ++ nmsub_macro 4, 5, 0, D2 ++ nmsub_macro 4, 5, 2, D5 ++ GMUL xvf, d, U4, D7, U4, U5, D7, U5 ++ nmsub_macro 6, 7, 0, D3 ++ nmsub_macro 6, 7, 2, D6 ++ nmsub_macro 6, 7, 4, D8 ++ GMUL xvf, d, U6, D9, U6, U7, D9, U7 ++// Store A ++ A_st_macro 0, 7, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ ++ U2, C1, 0x00, U3, C1, 0x20, \ ++ U4, C2, 0x00, U5, C2, 0x20, \ ++ U6, C3, 0x00, U7, C3, 0x20 ++.endm ++ ++.macro dsolve_8x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 1 ++// 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 17, 0 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1 ++ ldrepl_macro 18, 18, 3 ++ nmsub_macro 2, 3, 0, D1 ++ GMUL xvf, d, U2, D2, U2, U3, D2, U3 ++// Store A ++ A_st_macro 0, 3, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ ++ U2, C1, 0x00, U3, C1, 0x20 ++.endm ++ ++.macro dsolve_4x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. 
The memory layout of ++// matrix B is as follows: ++//0 1 2 3 ++// 5 6 7 ++// 10 11 ++// 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 19, 0 ++ GMUL xvf, d, U0, D0, U0 ++ ldrepl_macro 20, 22, 5 ++ nmsub_macro 1, 1, 0, D1 ++ ldrepl_macro 23, 24, 10 ++ GMUL xvf, d, U1, D4, U1 ++ ldrepl_macro 25, 25, 15 ++ nmsub_macro 2, 2, 0, D2 ++ nmsub_macro 2, 2, 1, D5 ++ GMUL xvf, d, U2, D7, U2 ++ nmsub_macro 3, 3, 0, D3 ++ nmsub_macro 3, 3, 1, D6 ++ nmsub_macro 3, 3, 2, D8 ++ GMUL xvf, d, U3, D9, U3 ++// Store A ++ A_st_macro 0, 3, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 ++.endm ++ ++.macro dsolve_4x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 1 ++// 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 17, 0 ++ GMUL xvf, d, U0, D0, U0 ++ ldrepl_macro 18, 18, 3 ++ nmsub_macro 1, 1, 0, D1 ++ GMUL xvf, d, U1, D2, U1 ++// Store A ++ A_st_macro 0, 1, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C1, 0x00 ++.endm ++ ++.macro dsolve_2x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 1 2 3 ++// 5 6 7 ++// 10 11 ++// 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 19, 0 ++ GMUL xvf, d, U0, D0, U0 ++ ldrepl_macro 20, 22, 5 ++ nmsub_macro 1, 1, 0, D1 ++ ldrepl_macro 23, 24, 10 ++ GMUL xvf, d, U1, D4, U1 ++ ++ ldrepl_macro 25, 25, 15 ++ nmsub_macro 2, 2, 0, D2 ++ nmsub_macro 2, 2, 1, D5 ++ GMUL xvf, d, U2, D7, U2 ++ nmsub_macro 3, 3, 0, D3 ++ nmsub_macro 3, 3, 1, D6 ++ nmsub_macro 3, 3, 2, D8 ++ GMUL xvf, d, U3, D9, U3 ++// Store A ++ A_st_macro 0, 3, 0, 2 ++// Store C ++ GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00, ++.endm ++ ++.macro dsolve_2x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 1 ++// 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 17, 0 ++ GMUL xvf, d, U0, D0, U0 ++ ldrepl_macro 18, 18, 3 ++ nmsub_macro 1, 1, 0, D1 ++ GMUL xvf, d, U1, D2, U1 ++// Store A ++ A_st_macro 0, 1, 0, 2 ++// Store C ++ GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00 ++.endm ++ ++.macro dsolve_1x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 1 2 3 ++// 5 6 7 ++// 10 11 ++// 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 19, 0 ++ GMUL xvf, d, U0, D0, U0 ++ ldrepl_macro 20, 22, 5 ++ nmsub_macro 1, 1, 0, D1 ++ ldrepl_macro 23, 24, 10 ++ GMUL xvf, d, U1, D4, U1 ++ ++ ldrepl_macro 25, 25, 15 ++ nmsub_macro 2, 2, 0, D2 ++ nmsub_macro 2, 2, 1, D5 ++ GMUL xvf, d, U2, D7, U2 ++ nmsub_macro 3, 3, 0, D3 ++ nmsub_macro 3, 3, 1, D6 ++ nmsub_macro 3, 3, 2, D8 ++ GMUL xvf, d, U3, D9, U3 ++// Store A ++ A_st_macro 0, 3, 0, 1 ++// Store C ++ GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, ++.endm ++ ++.macro dsolve_1x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. 
The memory layout of ++// matrix B is as follows: ++//0 1 ++// 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 17, 0 ++ GMUL xvf, d, U0, D0, U0 ++ ldrepl_macro 18, 18, 3 ++ nmsub_macro 1, 1, 0, D1 ++ GMUL xvf, d, U1, D2, U1 ++// Store A ++ A_st_macro 0, 1, 0, 1 ++// Store C ++ GST f, d, $f0, C0, 0x00, $f1, C1, 0x00 ++.endm ++ ++.macro dgemm_dsolve_16x4 ++ bge ZERO, L, .L_dsolve_16x4_load ++ dgemm_16x4 ++ b .L_dsolve_16x4 ++.L_dsolve_16x4_load: ++ // Load C ++ GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 ++ GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 ++ GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 ++ GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 ++/********************** solver ******************/ ++.L_dsolve_16x4: ++ dsolve_16x4 ++.endm ++ ++.macro dgemm_dsolve_8x4 ++ bge ZERO, L, .L_dsolve_8x4_load ++ dgemm_8x4 ++ b .L_dsolve_8x4 ++.L_dsolve_8x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++ ++ /* Load C2 */ ++ xvld U4, C2, 0x00 ++ xvld U5, C2, 0x20 ++ ++ /* Load C3 */ ++ xvld U6, C3, 0x00 ++ xvld U7, C3, 0x20 ++/********* solver *********/ ++.L_dsolve_8x4: ++ dsolve_8x4 ++.endm ++ ++.macro dgemm_dsolve_4x4 ++ bge ZERO, L, .L_dsolve_4x4_load ++ dgemm_4x4 ++ b .L_dsolve_4x4 ++.L_dsolve_4x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++/************** solver *****************/ ++.L_dsolve_4x4: ++ dsolve_4x4 ++.endm ++ ++.macro dgemm_dsolve_2x4 ++ bge ZERO, L, .L_dsolve_2x4_load ++ dgemm_2x4 ++ xvpermi.q U2, U0, 0x01 ++ xvpermi.q U3, U1, 0x01 ++ b .L_dsolve_2x4 ++.L_dsolve_2x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++/********************** solver ******************/ ++.L_dsolve_2x4: ++ dsolve_2x4 ++.endm ++ ++.macro dgemm_dsolve_1x4 ++ bge ZERO, L, .L_dsolve_1x4_load ++ dgemm_1x4 ++ xvpackod.d U1, U0, U0 ++ xvpermi.q U2, U0, 0x01 ++ xvpermi.q U3, U1, 0x01 ++ b .L_dsolve_1x4 ++.L_dsolve_1x4_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++ fld.d $f1, C1, 0x00 ++ fld.d $f2, C2, 0x00 ++ fld.d $f3, C3, 0x00 ++.L_dsolve_1x4: ++ dsolve_1x4 ++.endm ++ ++.macro dgemm_dsolve_16x2 ++ bge ZERO, L, .L_dsolve_16x2_load ++ dgemm_16x2 ++ b .L_dsolve_16x2 ++.L_dsolve_16x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++ /* Load C1 */ ++ xvld U4, C1, 0x00 ++ xvld U5, C1, 0x20 ++ xvld U6, C1, 0x40 ++ xvld U7, C1, 0x60 ++.L_dsolve_16x2: ++ dsolve_16x2 ++.endm ++ ++.macro dgemm_dsolve_8x2 ++ bge ZERO, L, .L_dsolve_8x2_load ++ dgemm_8x2 ++ b .L_dsolve_8x2 ++.L_dsolve_8x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++.L_dsolve_8x2: ++ dsolve_8x2 ++.endm ++ ++.macro dgemm_dsolve_4x2 ++ bge ZERO, L, .L_dsolve_4x2_load ++ dgemm_4x2 ++ b .L_dsolve_4x2 ++.L_dsolve_4x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++.L_dsolve_4x2: ++ dsolve_4x2 ++.endm ++ ++.macro dgemm_dsolve_2x2 ++ bge ZERO, L, .L_dsolve_2x2_load ++ dgemm_2x2 ++ b .L_dsolve_2x2 ++.L_dsolve_2x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++.L_dsolve_2x2: ++ dsolve_2x2 ++.endm ++ ++.macro dgemm_dsolve_1x2 ++ bge ZERO, L, 
.L_dsolve_1x2_load ++ dgemm_1x2 ++ xvpackod.d U1, U0, U0 ++ b .L_dsolve_1x2 ++.L_dsolve_1x2_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++ fld.d $f1, C1, 0x00 ++.L_dsolve_1x2: ++ dsolve_1x2 ++.endm ++ ++.macro dgemm_dsolve_16x1 ++ bge ZERO, L, .L_dsolve_16x1_load ++ dgemm_16x1 ++ b .L_dsolve_16x1 ++.L_dsolve_16x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++.L_dsolve_16x1: ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 ++ // Store A ++ A_st_macro 0, 3, 0, 4 ++ // Strore C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 ++.endm ++ ++.macro dgemm_dsolve_8x1 ++ bge ZERO, L, .L_dsolve_8x1_load ++ dgemm_8x1 ++ b .L_dsolve_8x1 ++.L_dsolve_8x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++.L_dsolve_8x1: ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1 ++ // Store A ++ A_st_macro 0, 1, 0, 4 ++ // Strore C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20 ++.endm ++ ++.macro dgemm_dsolve_4x1 ++ bge ZERO, L, .L_dsolve_4x1_load ++ dgemm_4x1 ++ b .L_dsolve_4x1 ++.L_dsolve_4x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++.L_dsolve_4x1: ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0 ++ // Store A ++ A_st_macro 0, 0, 0, 4 ++ // Strore C ++ GST xv, , U0, C0, 0x00 ++.endm ++ ++.macro dgemm_dsolve_2x1 ++ bge ZERO, L, .L_dsolve_2x1_load ++ dgemm_2x1 ++ b .L_dsolve_2x1 ++.L_dsolve_2x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++.L_dsolve_2x1: ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0 ++ // Store A ++ A_st_macro 0, 0, 0, 2 ++ // Strore C ++ GST v, , $vr0, C0, 0x00 ++.endm ++ ++.macro dgemm_dsolve_1x1 ++ bge ZERO, L, .L_dsolve_1x1_load ++ dgemm_1x1 ++ b .L_dsolve_1x1 ++.L_dsolve_1x1_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++.L_dsolve_1x1: ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0 ++ // Store A ++ A_st_macro 0, 0, 0, 1 ++ // Strore C ++ GST f, d, $f0, C0, 0x00 ++.endm ++ ++ PROLOGUE ++ push_if_used 26, 32 ++ PTR_SLLI LDC, LDC, 3 ++ PTR_SUB KK, ZERO, OFFSET ++ /* if (!(N >> 2)) goto L_N3 */ ++ PTR_SRAI J, N, 2 /* J = bn >> 2 */ ++ andi N, N, 0x03 ++ beq ZERO, J, .L_N3 ++.align 5 ++.L_J1: ++ PTR_ADDI J, J, -1 ++ move AA, A ++ move CC, C ++ PTR_SRAI I, M, 4 // M >> 4 ++ beqz I, .L_M15 ++.align 4 ++.L_I1: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_16x4 ++ PTR_ADDI I, I, -1 ++ PTR_SLLI T0, K, 7 ++ PTR_ADDI CC, CC, 0x80 // cc += 16 ++ PTR_ADD AA, AA, T0 // aa += 16 * k ++ bnez I, .L_I1 ++.L_M15: ++ andi I, M, 8 ++ beqz I, .L_M7 ++.L_M8: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_8x4 ++ PTR_SLLI T0, K, 6 ++ PTR_ADDI CC, CC, 0x40 // cc += 8 ++ PTR_ADD AA, AA, T0 // aa += 8 * k ++.L_M7: ++ andi I, M, 4 ++ beqz I, .L_M3 ++.L_M4: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_4x4 ++ PTR_SLLI T0, K, 5 ++ PTR_ADDI CC, CC, 0x20 // cc += 4 ++ PTR_ADD AA, AA, T0 // aa += 4 * k ++.L_M3: ++ andi I, M, 2 ++ beqz I, .L_M1 ++.L_M2: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_2x4 ++ PTR_SLLI T0, K, 4 ++ PTR_ADDI CC, CC, 0x10 // cc += 2 ++ PTR_ADD AA, AA, T0 // aa += 2 * k ++.L_M1: ++ andi I, M, 1 ++ beqz I, .L_M0 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_1x4 
++ PTR_SLLI T0, K, 3 ++ PTR_ADDI CC, CC, 0x08 // cc += 1 ++ PTR_ADD AA, AA, T0 // aa += 1 * k ++.L_M0: ++ PTR_SLLI T0, K, 5 ++ PTR_SLLI T1, LDC, 2 ++ PTR_ADD B, B, T0 // b += 4 * k ++ PTR_ADD C, C, T1 // c += 4 * ldc ++ PTR_ADDI KK, KK, 4 // kk += 4 ++ bnez J, .L_J1 ++.L_N3: ++ andi J, N, 2 ++ beq ZERO, J, .L_N1 ++.L_N2: ++ move AA, A ++ move CC, C ++ PTR_SRAI I, M, 4 // M >> 4 ++ beqz I, .L_N2_M15 ++.align 4 ++.L_N2_I1: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_16x2 ++ PTR_ADDI I, I, -1 ++ PTR_SLLI T0, K, 7 ++ PTR_ADDI CC, CC, 0x80 // cc += 16 ++ PTR_ADD AA, AA, T0 // aa += 16 * k ++ bnez I, .L_N2_I1 ++.L_N2_M15: ++ andi I, M, 8 ++ beqz I, .L_N2_M7 ++.L_N2_M8: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_8x2 ++ PTR_SLLI T0, K, 6 ++ PTR_ADDI CC, CC, 0x40 // cc += 8 ++ PTR_ADD AA, AA, T0 // aa += 8 * k ++.L_N2_M7: ++ andi I, M, 4 ++ beqz I, .L_N2_M3 ++.L_N2_M4: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_4x2 ++ PTR_SLLI T0, K, 5 ++ PTR_ADDI CC, CC, 0x20 // cc += 4 ++ PTR_ADD AA, AA, T0 // aa += 4 * k ++.L_N2_M3: ++ andi I, M, 2 ++ beqz I, .L_N2_M1 ++.L_N2_M2: ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_2x2 ++ PTR_SLLI T0, K, 4 ++ PTR_ADDI CC, CC, 0x10 // cc += 2 ++ PTR_ADD AA, AA, T0 // aa += 2 * k ++.L_N2_M1: ++ andi I, M, 1 ++ beqz I, .L_N2_M0 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_1x2 ++ PTR_SLLI T0, K, 3 ++ PTR_ADDI CC, CC, 0x08 // cc += 1 ++ PTR_ADD AA, AA, T0 // aa += 1 * k ++.L_N2_M0: ++ PTR_SLLI T0, K, 4 ++ PTR_SLLI T1, LDC, 1 ++ PTR_ADD B, B, T0 // b += 2 * k ++ PTR_ADD C, C, T1 // c += 2 * ldc ++ PTR_ADDI KK, KK, 2 // kk += 2 ++.L_N1: ++ andi J, N, 1 ++ beq ZERO, J, .L_N0 ++ move AA, A ++ move CC, C ++ PTR_SRAI I, M, 4 // M >> 4 ++ beqz I, .L_N1_M15 ++.align 4 ++.L_N1_I1: ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_16x1 ++ PTR_ADDI I, I, -1 ++ PTR_SLLI T0, K, 7 ++ PTR_ADDI CC, CC, 0x80 // cc += 16 ++ PTR_ADD AA, AA, T0 // aa += 16 * k ++ bnez I, .L_N1_I1 ++.L_N1_M15: ++ andi I, M, 8 ++ beqz I, .L_N1_M7 ++.L_N1_M8: ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_8x1 ++ PTR_SLLI T0, K, 6 ++ PTR_ADDI CC, CC, 0x40 // cc += 8 ++ PTR_ADD AA, AA, T0 // aa += 8 * k ++.L_N1_M7: ++ andi I, M, 4 ++ beqz I, .L_N1_M3 ++.L_N1_M4: ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_4x1 ++ PTR_SLLI T0, K, 5 ++ PTR_ADDI CC, CC, 0x20 // cc += 4 ++ PTR_ADD AA, AA, T0 // aa += 4 * k ++.L_N1_M3: ++ andi I, M, 2 ++ beqz I, .L_N1_M1 ++.L_N1_M2: ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_2x1 ++ PTR_SLLI T0, K, 4 ++ PTR_ADDI CC, CC, 0x10 // cc += 2 ++ PTR_ADD AA, AA, T0 // aa += 2 * k ++.L_N1_M1: ++ andi I, M, 1 ++ beqz I, .L_N1_M0 ++ GADD , d, C0, CC, ZERO ++ move A0, AA ++ move B0, B ++ move L, KK ++ dgemm_dsolve_1x1 ++ PTR_SLLI T0, K, 3 ++ PTR_ADDI CC, CC, 0x08 // cc += 1 ++ PTR_ADD AA, AA, T0 // aa += 1 * k ++.L_N1_M0: ++.L_N0: ++ pop_if_used 26, 32 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S +new file mode 100644 +index 000000000..5f86d75b5 +--- /dev/null ++++ b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S +@@ -0,0 +1,953 @@ 
++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/09/26 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++*********************************************************************/ ++ ++/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, ++ * FLOAT *c, BLASLONG ldc, BLASLONG offset) ++ */ ++#define M $r4 // param 1: bm ++#define N $r5 // param 2: bn ++#define K $r6 // param 3: bk ++#define A $r7 // param 5: ba ++#define B $r8 // param 6: bb ++#define C $r9 // param 7: bc ++#define LDC $r10 // param 8: ldc ++#define OFFSET $r11 // param 9: offset ++ ++/* Cycle control parameters */ ++#define I $r13 ++#define J $r14 ++#define L $r15 ++#define TL $r16 ++/* Matrix address */ ++#define A0 $r17 ++#define B0 $r18 ++#define C0 $r19 ++#define C1 $r20 ++#define C2 $r23 ++#define C3 $r24 ++#define T0 $r25 ++#define T1 $r26 ++#define T2 $r27 ++#define KK $r28 ++#define AA $r29 ++#define CC $r30 ++#define BB $r31 ++#undef ZERO ++#define ZERO $r0 ++ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++#define U8 $xr8 ++#define U9 $xr9 ++#define U10 $xr10 ++#define U11 $xr11 ++#define U12 $xr12 ++#define U13 $xr13 ++#define U14 $xr14 ++#define U15 $xr15 ++#define D0 $xr16 ++#define D1 $xr17 ++#define D2 $xr18 ++#define D3 $xr19 ++#define D4 $xr20 ++#define D5 $xr21 ++#define D6 $xr22 ++#define D7 $xr23 ++#define D8 $xr24 ++#define D9 $xr25 ++#define D10 $xr26 ++#define D11 $xr27 ++#define D12 $xr28 ++#define D13 $xr29 ++#define D14 $xr30 ++#define D15 $xr31 ++ ++/* Prefetch interval */ ++#define A_PRE 0x400 ++#define B_PRE 0x100 ++ ++#include "dtrsm_kernel_macro.S" ++ ++.macro ldrepl_macro start, 
end, stride ++// Load Ux (x = 0...15) ++.if \start <= \end ++ GLDREPL xv, d, $xr\start, B0, \stride * 8 ++ ldrepl_macro %start + 1, \end, %stride + 1 ++.endif ++.endm ++ ++.macro nmsub_macro start0, end0, start1, reg ++// Ux -= reg * Dx ++.if \start0 <= \end0 ++ xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 ++ nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg ++.endif ++.endm ++ ++.macro A_st_macro start, end, stride, N ++// Store Ux(x = 0...15) ++.if \start <= \end ++.if \N == 4 ++ xvst $xr\start, A0, \stride * 0x20 ++.elseif \N == 2 ++ vst $vr\start, A0, \stride * 0x10 ++.elseif \N == 1 ++ fst.d $f\start, A0, \stride * 0x08 ++.endif ++ A_st_macro %start + 1, \end, %stride + 1, \N ++.endif ++.endm ++ ++.macro dsolve_16x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 ++//2 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 16, 0 ++ ldrepl_macro 17, 18, 2 ++ GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 ++ nmsub_macro 0, 3, 4, D1 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 ++// Store A ++ A_st_macro 0, 7, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ ++ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 ++.endm ++ ++.macro dsolve_8x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 ++//2 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 16, 0 ++ ldrepl_macro 17, 18, 2 ++ GMUL xvf, d, U2, D2, U2, U3, D2, U3 ++ nmsub_macro 0, 1, 2, D1 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1 ++// Store A ++ A_st_macro 0, 3, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ ++ U2, C1, 0x00, U3, C1, 0x20 ++.endm ++ ++.macro dsolve_4x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 ++//2 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 16, 0 ++ ldrepl_macro 17, 18, 2 ++ GMUL xvf, d, U1, D2, U1 ++ nmsub_macro 0, 0, 1, D1 ++ GMUL xvf, d, U0, D0, U0 ++// Store A ++ A_st_macro 0, 1, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C1, 0x00 ++.endm ++ ++.macro dsolve_2x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 ++//2 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 16, 0 ++ ldrepl_macro 17, 18, 2 ++ GMUL xvf, d, U1, D2, U1 ++ nmsub_macro 0, 0, 1, D1 ++ GMUL xvf, d, U0, D0, U0 ++// Store A ++ A_st_macro 0, 1, 0, 2 ++// Store C ++ GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00 ++.endm ++ ++.macro dsolve_1x2 ++// We are going to process matrix B with a size of 2x2, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 ++//2 3 ++// Sequentially extract data from B in row order ++ ldrepl_macro 16, 16, 0 ++ ldrepl_macro 17, 18, 2 ++ GMUL xvf, d, U1, D2, U1 ++ nmsub_macro 0, 0, 1, D1 ++ GMUL xvf, d, U0, D0, U0 ++// Store A ++ A_st_macro 0, 1, 0, 1 ++// Store C ++ GST f, d, $f0, C0, 0x00, $f1, C1, 0x00 ++.endm ++ ++.macro dsolve_16x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. 
The memory layout of ++// matrix B is as follows: ++//0 ++//4 5 ++//8 9 10 ++//12 13 14 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 22, 25, 12 ++ GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 ++ ldrepl_macro 19, 21, 8 ++ nmsub_macro 8, 11, 12, D8 ++ ldrepl_macro 17, 18, 4 ++ GMUL xvf, d, U8, D5, U8, U9, D5, U9, U10, D5, U10, U11, D5, U11 ++ ldrepl_macro 16, 16, 0 ++ nmsub_macro 4, 7, 12, D7 ++ nmsub_macro 4, 7, 8, D4 ++ GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 ++ nmsub_macro 0, 3, 12, D6 ++ nmsub_macro 0, 3, 8, D3 ++ nmsub_macro 0, 3, 4, D1 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 ++// Store A ++ A_st_macro 0, 15, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ ++ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ ++ U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \ ++ U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 ++.endm ++ ++.macro dsolve_8x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 ++//4 5 ++//8 9 10 ++//12 13 14 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 22, 25, 12 ++ GMUL xvf, d, U6, D9, U6, U7, D9, U7 ++ ldrepl_macro 19, 21, 8 ++ nmsub_macro 4, 5, 6, D8 ++ ldrepl_macro 17, 18, 4 ++ GMUL xvf, d, U4, D5, U4, U5, D5, U5 ++ ldrepl_macro 16, 16, 0 ++ nmsub_macro 2, 3, 6, D7 ++ nmsub_macro 2, 3, 4, D4 ++ GMUL xvf, d, U2, D2, U2, U3, D2, U3 ++ nmsub_macro 0, 1, 6, D6 ++ nmsub_macro 0, 1, 4, D3 ++ nmsub_macro 0, 1, 2, D1 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1 ++// Store A ++ A_st_macro 0, 7, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ ++ U2, C1, 0x00, U3, C1, 0x20, \ ++ U4, C2, 0x00, U5, C2, 0x20, \ ++ U6, C3, 0x00, U7, C3, 0x20 ++.endm ++ ++.macro dsolve_4x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 ++//4 5 ++//8 9 10 ++//12 13 14 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 22, 25, 12 ++ GMUL xvf, d, U3, D9, U3 ++ ldrepl_macro 19, 21, 8 ++ nmsub_macro 2, 2, 3, D8 ++ ldrepl_macro 17, 18, 4 ++ GMUL xvf, d, U2, D5, U2 ++ ldrepl_macro 16, 16, 0 ++ nmsub_macro 1, 1, 3, D7 ++ nmsub_macro 1, 1, 2, D4 ++ GMUL xvf, d, U1, D2, U1 ++ nmsub_macro 0, 0, 3, D6 ++ nmsub_macro 0, 0, 2, D3 ++ nmsub_macro 0, 0, 1, D1 ++ GMUL xvf, d, U0, D0, U0 ++// Store A ++ A_st_macro 0, 3, 0, 4 ++// Store C ++ GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 ++.endm ++ ++.macro dsolve_2x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. 
The memory layout of ++// matrix B is as follows: ++//0 ++//4 5 ++//8 9 10 ++//12 13 14 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 22, 25, 12 ++ GMUL xvf, d, U3, D9, U3 ++ ldrepl_macro 19, 21, 8 ++ nmsub_macro 2, 2, 3, D8 ++ ldrepl_macro 17, 18, 4 ++ GMUL xvf, d, U2, D5, U2 ++ ldrepl_macro 16, 16, 0 ++ nmsub_macro 1, 1, 3, D7 ++ nmsub_macro 1, 1, 2, D4 ++ GMUL xvf, d, U1, D2, U1 ++ nmsub_macro 0, 0, 3, D6 ++ nmsub_macro 0, 0, 2, D3 ++ nmsub_macro 0, 0, 1, D1 ++ GMUL xvf, d, U0, D0, U0 ++// Store A ++ A_st_macro 0, 3, 0, 2 ++// Store C ++ GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00 ++.endm ++ ++.macro dsolve_1x4 ++// We are going to process matrix B with a size of 4x4, ++// using only the upper triangular portion. The memory layout of ++// matrix B is as follows: ++//0 ++//4 5 ++//8 9 10 ++//12 13 14 15 ++// Sequentially extract data from B in row order ++ ldrepl_macro 22, 25, 12 ++ GMUL xvf, d, U3, D9, U3 ++ ldrepl_macro 19, 21, 8 ++ nmsub_macro 2, 2, 3, D8 ++ ldrepl_macro 17, 18, 4 ++ GMUL xvf, d, U2, D5, U2 ++ ldrepl_macro 16, 16, 0 ++ nmsub_macro 1, 1, 3, D7 ++ nmsub_macro 1, 1, 2, D4 ++ GMUL xvf, d, U1, D2, U1 ++ nmsub_macro 0, 0, 3, D6 ++ nmsub_macro 0, 0, 2, D3 ++ nmsub_macro 0, 0, 1, D1 ++ GMUL xvf, d, U0, D0, U0 ++// Store A ++ A_st_macro 0, 3, 0, 1 ++// Store C ++ GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, ++.endm ++ ++.macro dgemm_dsolve_16x1 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_16x1_load ++ dgemm_16x1 ++ b .L_dsolve_16x1 ++.L_dsolve_16x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++.L_dsolve_16x1: ++ PTR_ADDI A0, T1, -16 * 8 ++ PTR_ADDI B0, T2, -1 * 8 ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 ++ // Store A ++ A_st_macro 0, 3, 0, 4 ++ // Strore C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 ++.endm ++ ++.macro dgemm_dsolve_8x1 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_8x1_load ++ dgemm_8x1 ++ b .L_dsolve_8x1 ++.L_dsolve_8x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++.L_dsolve_8x1: ++ PTR_ADDI A0, T1, -8 * 8 ++ PTR_ADDI B0, T2, -1 * 8 ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0, U1, D0, U1 ++ // Store A ++ A_st_macro 0, 1, 0, 4 ++ // Strore C ++ GST xv, , U0, C0, 0x00, U1, C0, 0x20 ++.endm ++ ++.macro dgemm_dsolve_4x1 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_4x1_load ++ dgemm_4x1 ++ b .L_dsolve_4x1 ++.L_dsolve_4x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++.L_dsolve_4x1: ++ PTR_ADDI A0, T1, -4 * 8 ++ PTR_ADDI B0, T2, -1 * 8 ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0 ++ // Store A ++ A_st_macro 0, 0, 0, 4 ++ // Strore C ++ GST xv, , U0, C0, 0x00 ++.endm ++ ++.macro dgemm_dsolve_2x1 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_2x1_load ++ dgemm_2x1 ++ b .L_dsolve_2x1 ++.L_dsolve_2x1_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++.L_dsolve_2x1: ++ PTR_ADDI A0, T1, -2 * 8 ++ PTR_ADDI B0, T2, -1 * 8 ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0 ++ // Store A ++ A_st_macro 0, 0, 0, 2 ++ // Strore C ++ GST v, , $vr0, C0, 0x00 ++.endm ++ ++.macro dgemm_dsolve_1x1 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_1x1_load ++ dgemm_1x1 ++ b .L_dsolve_1x1 ++.L_dsolve_1x1_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++.L_dsolve_1x1: ++ PTR_ADDI A0, T1, -1 * 8 ++ PTR_ADDI B0, T2, -1 * 8 ++ ldrepl_macro 16, 16, 0 ++ GMUL xvf, d, U0, D0, U0 ++ // Store 
A ++ A_st_macro 0, 0, 0, 1 ++ // Strore C ++ GST f, d, $f0, C0, 0x00 ++.endm ++ ++.macro dgemm_dsolve_16x2 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_16x2_load ++ dgemm_16x2 ++ b .L_dsolve_16x2 ++.L_dsolve_16x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++ /* Load C1 */ ++ xvld U4, C1, 0x00 ++ xvld U5, C1, 0x20 ++ xvld U6, C1, 0x40 ++ xvld U7, C1, 0x60 ++.L_dsolve_16x2: ++ PTR_ADDI A0, T1, -(16 * 2) * 8 ++ PTR_ADDI B0, T2, -(2 * 2) * 8 ++ dsolve_16x2 ++.endm ++ ++.macro dgemm_dsolve_8x2 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_8x2_load ++ dgemm_8x2 ++ b .L_dsolve_8x2 ++.L_dsolve_8x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++.L_dsolve_8x2: ++ PTR_ADDI A0, T1, -(8 * 2) * 8 ++ PTR_ADDI B0, T2, -(2 * 2) * 8 ++ dsolve_8x2 ++.endm ++ ++.macro dgemm_dsolve_4x2 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_4x2_load ++ dgemm_4x2 ++ b .L_dsolve_4x2 ++.L_dsolve_4x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++.L_dsolve_4x2: ++ PTR_ADDI A0, T1, -(4 * 2) * 8 ++ PTR_ADDI B0, T2, -(2 * 2) * 8 ++ dsolve_4x2 ++.endm ++ ++.macro dgemm_dsolve_2x2 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_2x2_load ++ dgemm_2x2 ++ b .L_dsolve_2x2 ++.L_dsolve_2x2_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++.L_dsolve_2x2: ++ PTR_ADDI A0, T1, -(2 * 2) * 8 ++ PTR_ADDI B0, T2, -(2 * 2) * 8 ++ dsolve_2x2 ++.endm ++ ++.macro dgemm_dsolve_1x2 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_1x2_load ++ dgemm_1x2 ++ xvpackod.d U1, U0, U0 ++ b .L_dsolve_1x2 ++.L_dsolve_1x2_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++ fld.d $f1, C1, 0x00 ++.L_dsolve_1x2: ++ PTR_ADDI A0, T1, -(1 * 2) * 8 ++ PTR_ADDI B0, T2, -(2 * 2) * 8 ++ dsolve_1x2 ++.endm ++ ++.macro dgemm_dsolve_16x4 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_16x4_load ++ dgemm_16x4 ++ b .L_dsolve_16x4 ++.L_dsolve_16x4_load: ++ // Load C ++ GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 ++ GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 ++ GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 ++ GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 ++/********************** solver ******************/ ++.L_dsolve_16x4: ++ PTR_ADDI A0, T1, -(16 * 4) * 8 ++ PTR_ADDI B0, T2, -(4 * 4) * 8 ++ dsolve_16x4 ++.endm ++ ++.macro dgemm_dsolve_8x4 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_8x4_load ++ dgemm_8x4 ++ b .L_dsolve_8x4 ++.L_dsolve_8x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++ ++ /* Load C2 */ ++ xvld U4, C2, 0x00 ++ xvld U5, C2, 0x20 ++ ++ /* Load C3 */ ++ xvld U6, C3, 0x00 ++ xvld U7, C3, 0x20 ++/********* solver *********/ ++.L_dsolve_8x4: ++ PTR_ADDI A0, T1, -(8 * 4) * 8 ++ PTR_ADDI B0, T2, -(4 * 4) * 8 ++ dsolve_8x4 ++.endm ++ ++.macro dgemm_dsolve_4x4 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_4x4_load ++ dgemm_4x4 ++ b .L_dsolve_4x4 ++.L_dsolve_4x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++/************** solver *****************/ ++.L_dsolve_4x4: ++ PTR_ADDI A0, T1, -(4 * 4) * 8 ++ PTR_ADDI B0, T2, -(4 * 4) * 8 ++ dsolve_4x4 ++.endm ++ ++.macro dgemm_dsolve_2x4 ++ 
or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_2x4_load ++ dgemm_2x4 ++ xvpermi.q U2, U0, 0x01 ++ xvpermi.q U3, U1, 0x01 ++ b .L_dsolve_2x4 ++.L_dsolve_2x4_load: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++/********************** solver ******************/ ++.L_dsolve_2x4: ++ PTR_ADDI A0, T1, -(2 * 4) * 8 ++ PTR_ADDI B0, T2, -(4 * 4) * 8 ++ dsolve_2x4 ++.endm ++ ++.macro dgemm_dsolve_1x4 ++ or T1, A0, A0 ++ or T2, B0, B0 ++ bge ZERO, L, .L_dsolve_1x4_load ++ dgemm_1x4 ++ xvpackod.d U1, U0, U0 ++ xvpermi.q U2, U0, 0x01 ++ xvpermi.q U3, U1, 0x01 ++ b .L_dsolve_1x4 ++.L_dsolve_1x4_load: ++ // Load C ++ fld.d $f0, C0, 0x00 ++ fld.d $f1, C1, 0x00 ++ fld.d $f2, C2, 0x00 ++ fld.d $f3, C3, 0x00 ++.L_dsolve_1x4: ++ PTR_ADDI A0, T1, -(1 * 4) * 8 ++ PTR_ADDI B0, T2, -(4 * 4) * 8 ++ dsolve_1x4 ++.endm ++ ++ PROLOGUE ++ push_if_used 26, 32 ++ PTR_SLLI LDC, LDC, 3 ++ PTR_SUB KK, N, OFFSET ++ PTR_MUL T0, N, LDC ++ PTR_MUL T1, N, K ++ PTR_ADD C, C, T0 // c += n * ldc ++ PTR_SLLI T1, T1, 3 ++ PTR_ADD B, B, T1 ++ ++ andi J, N, 1 ++ beqz J, .L_N2 ++.L_N1: ++ move AA, A ++ PTR_SUB C, C, LDC // c -= ldc ++ PTR_SLLI T0, K, 3 ++ PTR_SLLI T1, KK, 3 ++ PTR_SUB B, B, T0 // b -= k ++ PTR_ADD BB, B, T1 // bb = b + kk ++ move CC, C ++ ++ PTR_SRAI I, M, 4 // M >> 4 ++ beqz I, .L_N1_M15 ++.align 4 ++.L_N1_I1: ++ PTR_SLLI T1, KK, 7 ++ GADD , d, C0, CC, ZERO ++ PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_16x1 ++ PTR_ADDI I, I, -1 ++ PTR_SLLI T0, K, 7 ++ PTR_ADDI CC, CC, 0x80 // cc += 16 ++ PTR_ADD AA, AA, T0 // aa += 16 * k ++ bnez I, .L_N1_I1 ++.L_N1_M15: ++ andi I, M, 8 ++ beqz I, .L_N1_M7 ++.L_N1_M8: ++ PTR_SLLI T1, KK, 6 ++ GADD , d, C0, CC, ZERO ++ PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_8x1 ++ PTR_SLLI T0, K, 6 ++ PTR_ADDI CC, CC, 0x40 // cc += 8 ++ PTR_ADD AA, AA, T0 // aa += 8 * k ++.L_N1_M7: ++ andi I, M, 4 ++ beqz I, .L_N1_M3 ++.L_N1_M4: ++ PTR_SLLI T1, KK, 5 ++ GADD , d, C0, CC, ZERO ++ PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_4x1 ++ PTR_SLLI T0, K, 5 ++ PTR_ADDI CC, CC, 0x20 // cc += 4 ++ PTR_ADD AA, AA, T0 // aa += 4 * k ++.L_N1_M3: ++ andi I, M, 2 ++ beqz I, .L_N1_M1 ++.L_N1_M2: ++ PTR_SLLI T1, KK, 4 ++ GADD , d, C0, CC, ZERO ++ PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_2x1 ++ PTR_SLLI T0, K, 4 ++ PTR_ADDI CC, CC, 0x10 // cc += 2 ++ PTR_ADD AA, AA, T0 // aa += 2 * k ++.L_N1_M1: ++ andi I, M, 1 ++ beqz I, .L_N1_M0 ++ PTR_SLLI T1, KK, 3 ++ GADD , d, C0, CC, ZERO ++ PTR_ADD A0, AA, T1 // a0 = aa + kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_1x1 ++ PTR_SLLI T0, K, 3 ++ PTR_ADDI CC, CC, 0x08 // cc += 1 ++ PTR_ADD AA, AA, T0 // aa += 1 * k ++.L_N1_M0: ++ PTR_ADDI KK, KK, -1 ++.L_N2: ++ andi J, N, 2 ++ beq ZERO, J, .L_N4 ++ move AA, A ++ PTR_SLLI T0, LDC, 1 ++ PTR_SLLI T1, K, 4 ++ PTR_SLLI T2, KK, 4 ++ PTR_SUB B, B, T1 ++ PTR_SUB C, C, T0 ++ PTR_ADD BB, B, T2 ++ move CC, C ++ PTR_SRAI I, M, 4 // M >> 4 ++ beqz I, .L_N2_M15 ++.align 4 ++.L_N2_I1: ++ PTR_SLLI T1, KK, 7 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_16x2 ++ PTR_ADDI I, I, -1 ++ PTR_SLLI T0, K, 7 ++ PTR_ADDI CC, CC, 0x80 // cc += 16 ++ PTR_ADD AA, AA, T0 // aa += 16 * 
k ++ bnez I, .L_N2_I1 ++.L_N2_M15: ++ andi I, M, 8 ++ beqz I, .L_N2_M7 ++.L_N2_M8: ++ PTR_SLLI T1, KK, 6 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_8x2 ++ PTR_SLLI T0, K, 6 ++ PTR_ADDI CC, CC, 0x40 // cc += 8 ++ PTR_ADD AA, AA, T0 // aa += 8 * k ++.L_N2_M7: ++ andi I, M, 4 ++ beqz I, .L_N2_M3 ++.L_N2_M4: ++ PTR_SLLI T1, KK, 5 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_4x2 ++ PTR_SLLI T0, K, 5 ++ PTR_ADDI CC, CC, 0x20 // cc += 4 ++ PTR_ADD AA, AA, T0 // aa += 4 * k ++.L_N2_M3: ++ andi I, M, 2 ++ beqz I, .L_N2_M1 ++.L_N2_M2: ++ PTR_SLLI T1, KK, 4 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_2x2 ++ PTR_SLLI T0, K, 4 ++ PTR_ADDI CC, CC, 0x10 // cc += 2 ++ PTR_ADD AA, AA, T0 // aa += 2 * k ++.L_N2_M1: ++ andi I, M, 1 ++ beqz I, .L_N2_M0 ++ PTR_SLLI T1, KK, 3 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_1x2 ++ PTR_SLLI T0, K, 3 ++ PTR_ADDI CC, CC, 0x08 // cc += 1 ++ PTR_ADD AA, AA, T0 // aa += 1 * k ++.L_N2_M0: ++ PTR_ADDI KK, KK, -2 ++.L_N4: ++ PTR_SRAI J, N, 2 /* J = bn >> 2 */ ++ beq ZERO, J, .L_N0 ++.align 5 ++.L_J1: ++ PTR_ADDI J, J, -1 ++ move AA, A ++ PTR_SLLI T0, LDC, 2 ++ PTR_SLLI T1, K, 5 ++ PTR_SLLI T2, KK, 5 ++ PTR_SUB B, B, T1 ++ PTR_SUB C, C, T0 ++ PTR_ADD BB, B, T2 ++ move CC, C ++ PTR_SRAI I, M, 4 // M >> 4 ++ beqz I, .L_M15 ++.align 4 ++.L_I1: ++ PTR_SLLI T1, KK, 7 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_16x4 ++ PTR_ADDI I, I, -1 ++ PTR_SLLI T0, K, 7 ++ PTR_ADDI CC, CC, 0x80 // cc += 16 ++ PTR_ADD AA, AA, T0 // aa += 16 * k ++ bnez I, .L_I1 ++.L_M15: ++ andi I, M, 8 ++ beqz I, .L_M7 ++.L_M8: ++ PTR_SLLI T1, KK, 6 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_8x4 ++ PTR_SLLI T0, K, 6 ++ PTR_ADDI CC, CC, 0x40 // cc += 8 ++ PTR_ADD AA, AA, T0 // aa += 8 * k ++.L_M7: ++ andi I, M, 4 ++ beqz I, .L_M3 ++.L_M4: ++ PTR_SLLI T1, KK, 5 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_4x4 ++ PTR_SLLI T0, K, 5 ++ PTR_ADDI CC, CC, 0x20 // cc += 4 ++ PTR_ADD AA, AA, T0 // aa += 4 * k ++.L_M3: ++ andi I, M, 2 ++ beqz I, .L_M1 ++.L_M2: ++ PTR_SLLI T1, KK, 4 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_2x4 ++ PTR_SLLI T0, K, 4 ++ PTR_ADDI CC, CC, 0x10 // cc += 2 ++ PTR_ADD AA, AA, T0 // aa += 2 * k ++.L_M1: ++ andi I, M, 1 ++ beqz I, .L_M0 ++ PTR_SLLI T1, KK, 3 ++ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC ++ PTR_ADD A0, AA, T1 // a0 = aa + kk ++ move B0, BB ++ PTR_SUB L, K, KK // L = K - KK ++ dgemm_dsolve_1x4 ++ PTR_SLLI T0, K, 3 ++ PTR_ADDI CC, CC, 0x08 // cc += 1 ++ PTR_ADD AA, AA, T0 // aa += 1 * k ++.L_M0: ++ PTR_ADDI KK, KK, -4 ++ bnez J, .L_J1 ++.L_N0: ++ pop_if_used 26, 32 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/dtrsm_kernel_macro.S 
b/kernel/loongarch64/dtrsm_kernel_macro.S +new file mode 100644 +index 000000000..88b7121d1 +--- /dev/null ++++ b/kernel/loongarch64/dtrsm_kernel_macro.S +@@ -0,0 +1,2147 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*******************************************************************************/ ++ ++/************** Dgemm Kernel 16x4 ****************/ ++.macro KERNEL2x16x4 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ ++ xvld U2, A0, 0x40 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvld U3, A0, 0x60 ++ xvfmadd.d D6, U10, U13, D6 ++ xvfmadd.d D7, U11, U13, D7 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D8, U8, U14, D8 ++ xvfmadd.d D9, U9, U14, D9 ++ ++ preld 0, B0, B_PRE ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D10, U10, U14, D10 ++ xvfmadd.d D11, U11, U14, D11 ++ ++ preld 0, A0, A_PRE ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D12, U8, U15, D12 ++ xvfmadd.d D13, U9, U15, D13 ++ ++ preld 0, A0, A_PRE + 0x40 ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D14, U10, U15, D14 ++ xvfmadd.d D15, U11, U15, D15 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x20 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvld U9, A0, 0x20 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvld U10, A0, 0x40 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvld U11, A0, 0x60 ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 ++ ++ preld 0, B0, B_PRE ++ xvldrepl.d U13, B0, 0x08 ++ xvfmadd.d D10, U2, U6, D10 ++ xvfmadd.d D11, U3, U6, D11 ++ ++ preld 0, A0, A_PRE ++ xvldrepl.d U14, B0, 0x10 ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 ++ ++ preld 0, A0, A_PRE + 0x40 ++ xvldrepl.d U15, B0, 0x18 ++ xvfmadd.d D14, U2, U7, D14 ++ xvfmadd.d D15, U3, U7, D15 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x20 ++.endm ++ ++.macro KERNEL2x16x4_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ ++ xvld U2, A0, 0x40 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvld U3, A0, 0x60 ++ xvfmadd.d D6, U10, U13, D6 ++ xvfmadd.d D7, U11, U13, D7 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D8, U8, U14, D8 ++ xvfmadd.d D9, U9, U14, D9 ++ ++ preld 0, B0, B_PRE ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D10, U10, U14, D10 ++ xvfmadd.d D11, U11, U14, D11 ++ ++ preld 0, A0, A_PRE ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D12, U8, U15, D12 ++ xvfmadd.d D13, U9, U15, D13 ++ ++ preld 0, A0, A_PRE + 0x40 ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D14, U10, U15, D14 ++ xvfmadd.d D15, U11, U15, D15 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x20 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 ++ ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 ++ ++ preld 0, B0, B_PRE ++ xvfmadd.d D10, U2, U6, D10 ++ xvfmadd.d D11, U3, U6, D11 ++ ++ preld 0, A0, A_PRE ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 ++ ++ preld 0, A0, A_PRE + 0x40 ++ xvfmadd.d D14, U2, U7, D14 ++ xvfmadd.d D15, U3, U7, D15 ++.endm ++ ++.macro KERNEL8x16x4 ++.rept 4 ++ KERNEL2x16x4 ++.endr ++.endm ++ ++.macro KERNEL8x16x4_END ++.rept 3 ++ KERNEL2x16x4 ++.endr ++ KERNEL2x16x4_END ++.endm ++ ++.macro KERNEL2x8x4 ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvldrepl.d U5, B0, 0x08 ++ 
xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U8, U14, D8 ++ xvfmadd.d D9, U9, U14, D9 ++ ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U8, U15, D12 ++ xvfmadd.d D13, U9, U15, D13 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x20 ++ ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U13, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvldrepl.d U14, B0, 0x10 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 ++ ++ xvldrepl.d U15, B0, 0x18 ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x20 ++.endm ++ ++.macro KERNEL2x8x4_END ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U8, U14, D8 ++ xvfmadd.d D9, U9, U14, D9 ++ ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U8, U15, D12 ++ xvfmadd.d D13, U9, U15, D13 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x20 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 ++ ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 ++.endm ++ ++.macro KERNEL8x8x4 ++.rept 4 ++ KERNEL2x8x4 ++.endr ++.endm ++ ++.macro KERNEL8x8x4_END ++.rept 3 ++ KERNEL2x8x4 ++.endr ++ KERNEL2x8x4_END ++.endm ++ ++.macro KERNEL2x4x4 ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U8, U13, D4 ++ ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U8, U14, D8 ++ ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U8, U15, D12 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 ++ ++ xvld U8, A0, 0x00 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U13, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ ++ xvldrepl.d U14, B0, 0x10 ++ xvfmadd.d D8, U0, U6, D8 ++ ++ xvldrepl.d U15, B0, 0x18 ++ xvfmadd.d D12, U0, U7, D12 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 ++.endm ++ ++.macro KERNEL2x4x4_END ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U8, U13, D4 ++ ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U8, U14, D8 ++ ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U8, U15, D12 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x20 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D12, U0, U7, D12 ++.endm ++ ++.macro KERNEL8x4x4 ++.rept 4 ++ KERNEL2x4x4 ++.endr ++.endm ++ ++.macro KERNEL8x4x4_END ++.rept 3 ++ KERNEL2x4x4 ++.endr ++ KERNEL2x4x4_END ++.endm ++ ++.macro KERNEL2x2x4 ++ xvldrepl.d U0, A0, 0x00 ++ xvldrepl.d U1, A0, 0x08 ++ ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U4, B0, 0x00 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++ ++ xvldrepl.d U8, A0, 0x00 ++ xvldrepl.d U9, A0, 0x08 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvld U12, B0, 0x00 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++.endm ++ ++.macro KERNEL2x2x4_END ++ xvldrepl.d U0, A0, 0x00 ++ xvldrepl.d U1, A0, 0x08 ++ ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U4, B0, 0x00 ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x20 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 
++.endm ++ ++.macro KERNEL8x2x4 ++.rept 4 ++ KERNEL2x2x4 ++.endr ++.endm ++ ++.macro KERNEL8x2x4_END ++.rept 3 ++ KERNEL2x2x4 ++.endr ++ KERNEL2x2x4_END ++.endm ++ ++.macro KERNEL2x1x4 ++ xvldrepl.d U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvld U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++ ++ xvldrepl.d U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvld U12, B0, 0x00 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++.endm ++ ++.macro KERNEL2x1x4_END ++ xvldrepl.d U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvld U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x20 ++ ++ xvfmadd.d D0, U0, U4, D0 ++.endm ++ ++.macro KERNEL8x1x4 ++.rept 4 ++ KERNEL2x1x4 ++.endr ++.endm ++ ++.macro KERNEL8x1x4_END ++.rept 3 ++ KERNEL2x1x4 ++.endr ++ KERNEL2x1x4_END ++.endm ++ ++.macro KERNEL2x16x2 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ ++ xvld U2, A0, 0x40 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvld U3, A0, 0x60 ++ xvfmadd.d D6, U10, U13, D6 ++ xvfmadd.d D7, U11, U13, D7 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvld U9, A0, 0x20 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvld U10, A0, 0x40 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvld U11, A0, 0x60 ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++.endm ++ ++.macro KERNEL2x16x2_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ ++ xvld U2, A0, 0x40 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvld U3, A0, 0x60 ++ xvfmadd.d D6, U10, U13, D6 ++ xvfmadd.d D7, U11, U13, D7 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x10 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 ++.endm ++ ++.macro KERNEL8x16x2 ++.rept 4 ++ KERNEL2x16x2 ++.endr ++.endm ++ ++.macro KERNEL8x16x2_END ++.rept 3 ++ KERNEL2x16x2 ++.endr ++ KERNEL2x16x2_END ++.endm ++ ++.macro KERNEL2x8x2 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U1, A0, 0x20 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvld U9, A0, 0x20 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++.endm ++ ++.macro KERNEL2x8x2_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U1, A0, 0x20 ++ xvfmadd.d D4, U8, U13, D4 ++ xvfmadd.d D5, U9, U13, D5 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x10 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++.endm ++ 
++.macro KERNEL8x8x2 ++.rept 4 ++ KERNEL2x8x2 ++.endr ++.endm ++ ++.macro KERNEL8x8x2_END ++.rept 3 ++ KERNEL2x8x2 ++ .endr ++ KERNEL2x8x2_END ++.endm ++ ++.macro KERNEL2x4x2 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++.endm ++ ++.macro KERNEL2x4x2_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x10 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++.endm ++ ++.macro KERNEL8x4x2 ++.rept 4 ++ KERNEL2x4x2 ++.endr ++.endm ++ ++.macro KERNEL8x4x2_END ++.rept 3 ++ KERNEL2x4x2 ++.endr ++ KERNEL2x4x2_END ++.endm ++ ++.macro KERNEL2x2x2 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++.endm ++ ++.macro KERNEL2x2x2_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x10 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++.endm ++ ++.macro KERNEL8x2x2 ++.rept 4 ++ KERNEL2x2x2 ++.endr ++.endm ++ ++.macro KERNEL8x2x2_END ++.rept 3 ++ KERNEL2x2x2 ++.endr ++ KERNEL2x2x2_END ++.endm ++ ++.macro KERNEL2x1x2 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++.endm ++ ++.macro KERNEL2x1x2_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D4, U8, U13, D4 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvldrepl.d U5, B0, 0x08 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D4, U0, U5, D4 ++.endm ++ ++.macro KERNEL8x1x2 ++.rept 4 ++ KERNEL2x1x2 ++.endr ++.endm ++ ++.macro KERNEL8x1x2_END ++.rept 3 ++ KERNEL2x1x2 ++.endr ++ KERNEL2x1x2_END ++.endm ++ ++.macro KERNEL2x16x1 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvld U9, A0, 0x20 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvld U10, A0, 0x40 ++ xvld U11, A0, 0x60 ++ ++ xvldrepl.d U12, B0, 0x00 ++ ++ addi.d A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++.endm ++ ++.macro KERNEL2x16x1_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ ++ xvld U1, A0, 0x20 ++ xvfmadd.d D2, U10, U12, D2 ++ xvfmadd.d D3, U11, U12, D3 ++ ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d 
A0, A0, 0x80 ++ addi.d B0, B0, 0x08 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++.endm ++ ++.macro KERNEL8x16x1 ++.rept 4 ++ KERNEL2x16x1 ++.endr ++.endm ++ ++.macro KERNEL8x16x1_END ++.rept 3 ++ KERNEL2x16x1 ++.endr ++ KERNEL2x16x1_END ++.endm ++ ++.macro KERNEL2x8x1 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ xvld U1, A0, 0x20 ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvld U9, A0, 0x20 ++ xvldrepl.d U12, B0, 0x00 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++.endm ++ ++.macro KERNEL2x8x1_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvfmadd.d D1, U9, U12, D1 ++ xvld U1, A0, 0x20 ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x40 ++ addi.d B0, B0, 0x08 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++.endm ++ ++.macro KERNEL8x8x1 ++.rept 4 ++ KERNEL2x8x1 ++.endr ++.endm ++ ++.macro KERNEL8x8x1_END ++.rept 3 ++ KERNEL2x8x1 ++.endr ++ KERNEL2x8x1_END ++.endm ++ ++.macro KERNEL2x4x1 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvldrepl.d U12, B0, 0x00 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 ++.endm ++ ++.macro KERNEL2x4x1_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x20 ++ addi.d B0, B0, 0x08 ++ ++ xvfmadd.d D0, U0, U4, D0 ++.endm ++ ++.macro KERNEL8x4x1 ++.rept 4 ++ KERNEL2x4x1 ++.endr ++.endm ++ ++.macro KERNEL8x4x1_END ++.rept 3 ++ KERNEL2x4x1 ++.endr ++ KERNEL2x4x1_END ++.endm ++ ++.macro KERNEL2x2x1 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvldrepl.d U12, B0, 0x00 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 ++.endm ++ ++.macro KERNEL2x2x1_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x10 ++ addi.d B0, B0, 0x08 ++ ++ xvfmadd.d D0, U0, U4, D0 ++.endm ++ ++.macro KERNEL8x2x1 ++.rept 4 ++ KERNEL2x2x1 ++.endr ++.endm ++ ++.macro KERNEL8x2x1_END ++.rept 3 ++ KERNEL2x2x1 ++.endr ++ KERNEL2x2x1_END ++.endm ++ ++.macro KERNEL2x1x1 ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ ++ xvld U8, A0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvldrepl.d U12, B0, 0x00 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++.endm ++ ++.macro KERNEL2x1x1_END ++ xvld U0, A0, 0x00 ++ xvfmadd.d D0, U8, U12, D0 ++ xvldrepl.d U4, B0, 0x00 ++ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ ++ xvfmadd.d D0, U0, U4, D0 ++.endm ++ ++.macro KERNEL8x1x1 ++.rept 4 ++ KERNEL2x1x1 ++.endr ++.endm ++ ++.macro KERNEL8x1x1_END ++.rept 3 ++ KERNEL2x1x1 ++.endr ++ KERNEL2x1x1_END ++.endm ++ ++.macro dgemm_16x4 ++.L_dgemm_16x4: // See dgemm_kernel_16x4.S ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ xvfmul.d D2, U2, U4 ++ xvfmul.d D3, U3, U4 ++ ++ xvldrepl.d U5, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U5 ++ xvfmul.d D5, U1, U5 ++ xvfmul.d D6, U2, U5 ++ xvfmul.d D7, U3, U5 ++ ++ xvldrepl.d U6, B0, 0x10 ++ /* line 3 */ ++ xvfmul.d D8, U0, U6 ++ xvfmul.d D9, U1, U6 ++ 
xvfmul.d D10, U2, U6 ++ xvfmul.d D11, U3, U6 ++ ++ xvldrepl.d U7, B0, 0x18 ++ /* line 4 */ ++ xvfmul.d D12, U0, U7 ++ xvfmul.d D13, U1, U7 ++ xvfmul.d D14, U2, U7 ++ xvfmul.d D15, U3, U7 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x80 ++ PTR_ADDI B0, B0, 0x20 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_L7 */ ++ beq ZERO,TL, .L_dgemm_16x4_L7 ++ ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ xvld U10, A0, 0x40 ++ xvld U11, A0, 0x60 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ xvldrepl.d U14, B0, 0x10 ++ xvldrepl.d U15, B0, 0x18 ++ PTR_ADDI A0, A0, 0x80 ++ PTR_ADDI B0, B0, 0x20 ++ ++ beq ZERO, TL, .L_dgemm_16x4_TL1_END ++.align 5 ++.L_dgemm_16x4_TL1: ++ KERNEL8x16x4 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO, TL, .L_dgemm_16x4_TL1 ++.L_dgemm_16x4_TL1_END: ++ KERNEL8x16x4_END ++.L_dgemm_16x4_L7: ++ andi TL, L, 7 ++ beq TL, ZERO, .L_dgemm_16x4_L0 ++.align 5 ++.L_dgemm_16x4_L71: ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 ++ ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 ++ xvfmadd.d D10, U2, U6, D10 ++ xvfmadd.d D11, U3, U6, D11 ++ ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 ++ xvfmadd.d D14, U2, U7, D14 ++ xvfmadd.d D15, U3, U7, D15 ++ ++ PTR_ADDI A0, A0, 0x80 ++ PTR_ADDI B0, B0, 0x20 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_16x4_L71 ++.L_dgemm_16x4_L0: ++ // Load C ++ GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 ++ GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 ++ GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 ++ GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 ++ GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3, \ ++ U4, U4, D4, U5, U5, D5, U6, U6, D6, U7, U7, D7, \ ++ U8, U8, D8, U9, U9, D9, U10, U10, D10, U11, U11, D11, \ ++ U12, U12, D12, U13, U13, D13, U14, U14, D14, U15, U15, D15 ++.endm ++ ++.macro dgemm_1x4 ++.L_dgemm_1x4: // See dgemm_kernel_16x4.S ++ xvldrepl.d U0, A0, 0x00 ++ xvld U4, B0, 0x00 ++ xvfmul.d D0, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x08 ++ PTR_ADDI B0, B0, 0x20 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M1_L7 */ ++ beq ZERO,TL, .L_dgemm_1x4_M1_L7 ++ xvldrepl.d U8, A0, 0x00 ++ ++ PTR_ADDI TL, TL, -1 ++ xvld U12, B0, 0x00 ++ PTR_ADDI A0, A0, 0x08 ++ PTR_ADDI B0, B0, 0x20 ++ ++ beq ZERO, TL, .L_dgemm_1x4_M1_TL1_END ++.align 5 ++.L_dgemm_1x4_M1_TL1: ++ KERNEL8x1x4 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_1x4_M1_TL1 ++.L_dgemm_1x4_M1_TL1_END: ++ KERNEL8x1x4_END ++.L_dgemm_1x4_M1_L7: ++ /* if (!(L & 7)) goto L_M1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_1x4_M1_L0 ++.align 5 ++.L_dgemm_1x4_M1_L71: ++ xvldrepl.d U0, A0, 0x00 ++ xvld U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x08 ++ PTR_ADDI B0, B0, 0x20 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_1x4_M1_L71 ++.L_dgemm_1x4_M1_L0: ++ // Load C ++ fld.d $f0, C0, 0x00 ++ fld.d $f1, C1, 0x00 ++ fld.d $f2, C2, 0x00 ++ fld.d $f3, C3, 0x00 ++ xvinsve0.d U0, U1, 0x01 ++ 
xvinsve0.d U0, U2, 0x02 ++ xvinsve0.d U0, U3, 0x03 ++ GSUB xvf, d, U0, U0, D0 ++.endm ++ ++.macro dgemm_2x4 ++.L_dgemm_2x4: ++ /* Load 2 * 64 from A0 */ ++ xvldrepl.d U0, A0, 0x00 ++ xvldrepl.d U1, A0, 0x08 ++ xvld U4, B0, 0x00 ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x10 ++ PTR_ADDI B0, B0, 0x20 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M2_L7 */ ++ beq ZERO,TL, .L_dgemm_2x4_M2_L7 ++ ++ xvldrepl.d U8, A0, 0x00 ++ xvldrepl.d U9, A0, 0x08 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvld U12, B0, 0x00 ++ PTR_ADDI A0, A0, 0x10 ++ PTR_ADDI B0, B0, 0x20 ++ ++ beq ZERO, TL, .L_dgemm_2x4_M2_TL1_END ++.align 5 ++.L_dgemm_2x4_M2_TL1: ++ KERNEL8x2x4 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_2x4_M2_TL1 ++.L_dgemm_2x4_M2_TL1_END: ++ KERNEL8x2x4_END ++ ++.L_dgemm_2x4_M2_L7: ++ /* if (!(L & 7)) goto L_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_2x4_M2_L0 ++.align 5 ++.L_dgemm_2x4_M2_L71: ++ xvldrepl.d U0, A0, 0x00 ++ xvldrepl.d U1, A0, 0x08 ++ ++ xvld U4, B0, 0x00 ++ ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x10 ++ PTR_ADDI B0, B0, 0x20 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_2x4_M2_L71 ++.L_dgemm_2x4_M2_L0: ++ xvpackev.d D4, D1, D0 ++ xvpackod.d D5, D1, D0 ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++ ++ xvpermi.q U0, U2, 0x02 ++ xvpermi.q U1, U3, 0x02 ++ ++ GSUB xvf, d, U0, U0, D4, U1, U1, D5 ++.endm ++ ++.macro dgemm_4x4 ++.L_dgemm_4x4: ++ /* Load 4 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ xvldrepl.d U5, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U5 ++ ++ xvldrepl.d U6, B0, 0x10 ++ /* line 3 */ ++ xvfmul.d D8, U0, U6 ++ ++ xvldrepl.d U7, B0, 0x18 ++ /* line 4 */ ++ xvfmul.d D12, U0, U7 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x20 ++ PTR_ADDI B0, B0, 0x20 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M4_L7 */ ++ beq ZERO,TL, .L_dgemm_4x4_M4_L7 ++ ++ xvld U8, A0, 0x00 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ xvldrepl.d U14, B0, 0x10 ++ xvldrepl.d U15, B0, 0x18 ++ PTR_ADDI A0, A0, 0x20 ++ PTR_ADDI B0, B0, 0x20 ++ ++ beq ZERO, TL, .L_dgemm_4x4_M4_TL1_END ++.align 5 ++.L_dgemm_4x4_M4_TL1: /* TL-- */ ++ KERNEL8x4x4 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_4x4_M4_TL1 ++.L_dgemm_4x4_M4_TL1_END: ++ KERNEL8x4x4_END ++.L_dgemm_4x4_M4_L7: ++ /* if (!(L & 7)) goto L_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_4x4_M4_L0 ++.align 5 ++.L_dgemm_4x4_M4_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U4, B0, 0x08 ++ xvfmadd.d D4, U0, U4, D4 ++ ++ xvldrepl.d U4, B0, 0x10 ++ xvfmadd.d D8, U0, U4, D8 ++ ++ xvldrepl.d U4, B0, 0x18 ++ xvfmadd.d D12, U0, U4, D12 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x20 ++ PTR_ADDI B0, B0, 0x20 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_4x4_M4_L71 ++ .L_dgemm_4x4_M4_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ /* Load C2 */ ++ xvld U2, C2, 0x00 ++ /* Load C3 */ ++ xvld U3, C3, 0x00 ++ ++ GSUB xvf, d, U0, U0, D0, U1, U1, D4, U2, U2, D8, U3, U3, D12 ++.endm ++ ++.macro dgemm_8x4 ++.L_dgemm_8x4: ++ /* Load 8 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ 
xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ ++ xvldrepl.d U5, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U5 ++ xvfmul.d D5, U1, U5 ++ ++ xvldrepl.d U6, B0, 0x10 ++ /* line 3 */ ++ xvfmul.d D8, U0, U6 ++ xvfmul.d D9, U1, U6 ++ ++ xvldrepl.d U7, B0, 0x18 ++ /* line 4 */ ++ xvfmul.d D12, U0, U7 ++ xvfmul.d D13, U1, U7 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x20 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M8_L7 */ ++ beq ZERO,TL, .L_dgemm_8x4_M8_L7 ++ ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ xvldrepl.d U14, B0, 0x10 ++ xvldrepl.d U15, B0, 0x18 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x20 ++ ++ beq ZERO, TL, .L_dgemm_8x4_M8_TL1_END ++.align 5 ++.L_dgemm_8x4_M8_TL1: /* TL-- */ ++ KERNEL8x8x4 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_8x4_M8_TL1 ++ ++.L_dgemm_8x4_M8_TL1_END: ++ KERNEL8x8x4_END ++ ++.L_dgemm_8x4_M8_L7: ++ /* if (!(L & 7)) goto L_M8_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_8x4_M8_L0 ++.align 5 ++.L_dgemm_8x4_M8_L71: ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ xvldrepl.d U6, B0, 0x10 ++ xvfmadd.d D8, U0, U6, D8 ++ xvfmadd.d D9, U1, U6, D9 ++ ++ xvldrepl.d U7, B0, 0x18 ++ xvfmadd.d D12, U0, U7, D12 ++ xvfmadd.d D13, U1, U7, D13 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x20 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_8x4_M8_L71 ++.L_dgemm_8x4_M8_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++ ++ /* Load C2 */ ++ xvld U4, C2, 0x00 ++ xvld U5, C2, 0x20 ++ ++ /* Load C3 */ ++ xvld U6, C3, 0x00 ++ xvld U7, C3, 0x20 ++ ++ GSUB xvf, d, U0, U0, D0, U1, U1, D1, \ ++ U2, U2, D4, U3, U3, D5, \ ++ U4, U4, D8, U5, U5, D9, \ ++ U6, U6, D12, U7, U7, D13 ++.endm ++ ++.macro dgemm_4x2 ++.L_dgemm_4x2: ++ /* Load 4 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ xvldrepl.d U5, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U5 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x20 ++ PTR_ADDI B0, B0, 0x10 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_dgemm_4x2_N3_M4_L7 */ ++ beq ZERO,TL, .L_dgemm_4x2_N3_M4_L7 ++ ++ xvld U8, A0, 0x00 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ PTR_ADDI A0, A0, 0x20 ++ PTR_ADDI B0, B0, 0x10 ++ ++ beq ZERO, TL, .L_dgemm_4x2_N3_M4_TL1_END ++.align 5 ++.L_dgemm_4x2_N3_M4_TL1: /* TL-- */ ++ KERNEL8x4x2 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_4x2_N3_M4_TL1 ++.L_dgemm_4x2_N3_M4_TL1_END: ++ KERNEL8x4x2_END ++ ++.L_dgemm_4x2_N3_M4_L7: ++ /* if (!(L & 7)) goto L_dgemm_4x2_N3_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_4x2_N3_M4_L0 ++.align 5 ++.L_dgemm_4x2_N3_M4_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x20 ++ PTR_ADDI B0, B0, 0x10 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_4x2_N3_M4_L71 ++ ++.L_dgemm_4x2_N3_M4_L0: ++ /* Load C0 */ ++ 
xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ GSUB xvf, d, U0, U0, D0, U1, U1, D4 ++.endm ++ ++.macro dgemm_2x2 ++.L_dgemm_2x2: ++ /* Load 2 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ xvldrepl.d U4, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x10 ++ PTR_ADDI B0, B0, 0x10 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_dgemm_2x2_N3_M2_L7 */ ++ beq ZERO,TL, .L_dgemm_2x2_N3_M2_L7 ++ ++ xvld U8, A0, 0x00 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ PTR_ADDI A0, A0, 0x10 ++ PTR_ADDI B0, B0, 0x10 ++ ++ beq ZERO, TL, .L_dgemm_2x2_N3_M2_TL1_END ++.align 5 ++.L_dgemm_2x2_N3_M2_TL1: /* TL-- */ ++ KERNEL8x2x2 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_2x2_N3_M2_TL1 ++.L_dgemm_2x2_N3_M2_TL1_END: ++ KERNEL8x2x2_END ++ ++.L_dgemm_2x2_N3_M2_L7: ++ /* if (!(L & 7)) goto L_dgemm_2x2_N3_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_2x2_N3_M2_L0 ++.align 5 ++.L_dgemm_2x2_N3_M2_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x10 ++ PTR_ADDI B0, B0, 0x10 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_2x2_N3_M2_L71 ++.L_dgemm_2x2_N3_M2_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ /* Load C1 */ ++ xvld U1, C1, 0x00 ++ GSUB xvf, d, U0, U0, D0, U1, U1, D4 ++.endm ++ ++.macro dgemm_8x2 ++.L_dgemm_8x2: ++ /* Load 8 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ ++ xvldrepl.d U5, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U5 ++ xvfmul.d D5, U1, U5 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x10 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_dgemm_8x2_N3_M8_L7 */ ++ beq ZERO,TL, .L_dgemm_8x2_N3_M8_L7 ++ ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x10 ++ ++ beq ZERO, TL, .L_dgemm_8x2_N3_M8_TL1_END ++.align 5 ++.L_dgemm_8x2_N3_M8_TL1: /* TL-- */ ++ KERNEL8x8x2 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_8x2_N3_M8_TL1 ++.L_dgemm_8x2_N3_M8_TL1_END: ++ KERNEL8x8x2_END ++ ++.L_dgemm_8x2_N3_M8_L7: ++ /* if (!(L & 7)) goto L_dgemm_8x2_N3_M8_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_8x2_N3_M8_L0 ++.align 5 ++.L_dgemm_8x2_N3_M8_L71: ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x10 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_8x2_N3_M8_L71 ++ ++.L_dgemm_8x2_N3_M8_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ /* Load C1 */ ++ xvld U2, C1, 0x00 ++ xvld U3, C1, 0x20 ++ GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D4, U3, U3, D5 ++.endm ++ ++.macro dgemm_16x2 ++.L_dgemm_16x2: ++ /* Load 16 * 64 from A0 ++ * U0 = {a3, a2, a1, a0} ++ * U1 = {a7, a6, a5, a4} ++ * U2 = {a11, a10, a9, a8} ++ * U3 = {a15, a14, a13, a12} ++ */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, 
A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ xvfmul.d D2, U2, U4 ++ xvfmul.d D3, U3, U4 ++ ++ xvldrepl.d U5, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U5 ++ xvfmul.d D5, U1, U5 ++ xvfmul.d D6, U2, U5 ++ xvfmul.d D7, U3, U5 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x80 ++ PTR_ADDI B0, B0, 0x10 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N3_L7 */ ++ beq ZERO,TL, .L_dgemm_16x2_N3_L7 ++ ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ xvld U10, A0, 0x40 ++ xvld U11, A0, 0x60 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ PTR_ADDI A0, A0, 0x80 ++ PTR_ADDI B0, B0, 0x10 ++ ++ beq ZERO, TL, .L_dgemm_16x2_N3_TL1_END ++.align 5 ++.L_dgemm_16x2_N3_TL1: /* TL-- */ ++ KERNEL8x16x2 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_16x2_N3_TL1 ++.L_dgemm_16x2_N3_TL1_END: ++ KERNEL8x16x2_END ++ ++.L_dgemm_16x2_N3_L7: ++ /* if (!(L & 7)) goto L_dgemm_16x2_N3_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_16x2_N3_L0 ++.align 5 ++.L_dgemm_16x2_N3_L71: ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ xvfmadd.d D5, U1, U5, D5 ++ xvfmadd.d D6, U2, U5, D6 ++ xvfmadd.d D7, U3, U5, D7 ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x80 ++ PTR_ADDI B0, B0, 0x10 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_16x2_N3_L71 ++ ++.L_dgemm_16x2_N3_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++ /* Load C1 */ ++ xvld U4, C1, 0x00 ++ xvld U5, C1, 0x20 ++ xvld U6, C1, 0x40 ++ xvld U7, C1, 0x60 ++ GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3, \ ++ U4, U4, D4, U5, U5, D5, U6, U6, D6, U7, U7, D7 ++.endm ++ ++.macro dgemm_2x1 ++.L_dgemm_2x1: ++ /* Load 2 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x10 ++ PTR_ADDI B0, B0, 0x08 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_dgemm_2x1_N1_M2_L7 */ ++ beq ZERO,TL, .L_dgemm_2x1_N1_M2_L7 ++ ++ xvld U8, A0, 0x00 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ PTR_ADDI A0, A0, 0x10 ++ PTR_ADDI B0, B0, 0x08 ++ ++ beq ZERO, TL, .L_dgemm_2x1_N1_M2_TL1_END ++.align 5 ++.L_dgemm_2x1_N1_M2_TL1: /* TL-- */ ++ KERNEL8x2x1 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_2x1_N1_M2_TL1 ++.L_dgemm_2x1_N1_M2_TL1_END: ++ KERNEL8x2x1_END ++ ++.L_dgemm_2x1_N1_M2_L7: ++ /* if (!(L & 7)) goto L_dgemm_2x1_N1_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_2x1_N1_M2_L0 ++.align 5 ++.L_dgemm_2x1_N1_M2_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x10 ++ PTR_ADDI B0, B0, 0x08 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_2x1_N1_M2_L71 ++.L_dgemm_2x1_N1_M2_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ GSUB xvf, d, U0, U0, D0 ++.endm ++ ++.macro dgemm_4x1 ++.L_dgemm_4x1: ++ /* Load 4 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x20 ++ PTR_ADDI B0, B0, 0x08 ++ /* 
Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_dgemm_4x1_N1_M4_L7 */ ++ beq ZERO,TL, .L_dgemm_4x1_N1_M4_L7 ++ ++ xvld U8, A0, 0x00 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ PTR_ADDI A0, A0, 0x20 ++ PTR_ADDI B0, B0, 0x08 ++ ++ beq ZERO, TL, .L_dgemm_4x1_N1_M4_TL1_END ++.align 5 ++.L_dgemm_4x1_N1_M4_TL1: /* TL-- */ ++ KERNEL8x4x1 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_4x1_N1_M4_TL1 ++.L_dgemm_4x1_N1_M4_TL1_END: ++ KERNEL8x4x1_END ++ ++.L_dgemm_4x1_N1_M4_L7: ++ /* if (!(L & 7)) goto L_dgemm_4x1_N1_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_4x1_N1_M4_L0 ++.align 5 ++.L_dgemm_4x1_N1_M4_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x20 ++ PTR_ADDI B0, B0, 0x08 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_4x1_N1_M4_L71 ++.L_dgemm_4x1_N1_M4_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ GSUB xvf, d, U0, U0, D0 ++.endm ++ ++.macro dgemm_8x1 ++.L_dgemm_8x1: ++ /* Load 8 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x08 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_dgemm_8x1_N1_M8_L7 */ ++ beq ZERO,TL, .L_dgemm_8x1_N1_M8_L7 ++ ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x08 ++ ++ beq ZERO, TL, .L_dgemm_8x1_N1_M8_TL1_END ++.align 5 ++.L_dgemm_8x1_N1_M8_TL1: /* TL-- */ ++ KERNEL8x8x1 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_8x1_N1_M8_TL1 ++ ++.L_dgemm_8x1_N1_M8_TL1_END: ++ KERNEL8x8x1_END ++ ++.L_dgemm_8x1_N1_M8_L7: ++ /* if (!(L & 7)) goto L_dgemm_8x1_N1_M8_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_8x1_N1_M8_L0 ++.align 5 ++.L_dgemm_8x1_N1_M8_L71: ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x08 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_8x1_N1_M8_L71 ++.L_dgemm_8x1_N1_M8_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ GSUB xvf, d, U0, U0, D0, U1, U1, D1 ++.endm ++ ++.macro dgemm_16x1 ++.L_dgemm_16x1: ++ /* Load 16 * 64 from A0 ++ * U0 = {a3, a2, a1, a0} ++ * U1 = {a7, a6, a5, a4} ++ * U2 = {a11, a10, a9, a8} ++ * U3 = {a15, a14, a13, a12} ++ */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ xvfmul.d D1, U1, U4 ++ xvfmul.d D2, U2, U4 ++ xvfmul.d D3, U3, U4 ++ ++ /* Add stride for A0 and B0 */ ++ PTR_ADDI A0, A0, 0x80 ++ PTR_ADDI B0, B0, 0x08 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_dgemm_16x1_N1_L7 */ ++ beq ZERO,TL, .L_dgemm_16x1_N1_L7 ++ ++ xvld U8, A0, 0x00 ++ xvld U9, A0, 0x20 ++ xvld U10, A0, 0x40 ++ xvld U11, A0, 0x60 ++ ++ PTR_ADDI TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ PTR_ADDI A0, A0, 0x80 ++ PTR_ADDI B0, B0, 0x08 ++ ++ beq ZERO, TL, .L_dgemm_16x1_N1_TL1_END ++.align 5 ++.L_dgemm_16x1_N1_TL1: /* TL-- */ ++ KERNEL8x16x1 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_16x1_N1_TL1 ++.L_dgemm_16x1_N1_TL1_END: ++ KERNEL8x16x1_END ++ ++.L_dgemm_16x1_N1_L7: 
++ /* if (!(L & 7)) goto L_dgemm_16x1_N1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_16x1_N1_L0 ++.align 5 ++.L_dgemm_16x1_N1_L71: ++ /* Load 16 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ xvld U1, A0, 0x20 ++ xvld U2, A0, 0x40 ++ xvld U3, A0, 0x60 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ xvfmadd.d D1, U1, U4, D1 ++ xvfmadd.d D2, U2, U4, D2 ++ xvfmadd.d D3, U3, U4, D3 ++ ++ /* Add stride for A0, B0 */ ++ PTR_ADDI A0, A0, 0x80 ++ PTR_ADDI B0, B0, 0x08 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_16x1_N1_L71 ++.L_dgemm_16x1_N1_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ xvld U1, C0, 0x20 ++ xvld U2, C0, 0x40 ++ xvld U3, C0, 0x60 ++ GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3 ++.endm ++ ++.macro dgemm_1x2 ++.L_dgemm_1x2: // See dgemm_kernel_16x4.S ++ /* Load 1 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ xvldrepl.d U4, B0, 0x08 ++ /* line 2 */ ++ xvfmul.d D4, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N3_M1_L7 */ ++ beq ZERO,TL, .L_dgemm_1x2_N3_M1_L7 ++ ++ xvld U8, A0, 0x00 ++ ++ addi.d TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ xvldrepl.d U13, B0, 0x08 ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ beq ZERO, TL, .L_dgemm_1x2_N3_M1_TL1_END ++.L_dgemm_1x2_N3_M1_TL1: /* TL-- */ ++ KERNEL8x1x2 ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_dgemm_1x2_N3_M1_TL1 ++.L_dgemm_1x2_N3_M1_TL1_END: ++ KERNEL8x1x2_END ++.L_dgemm_1x2_N3_M1_L7: ++ /* if (!(L & 7)) goto L_dgemm_1x2_N3_M1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_dgemm_1x2_N3_M1_L0 ++.L_dgemm_1x2_N3_M1_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ xvldrepl.d U5, B0, 0x08 ++ xvfmadd.d D4, U0, U5, D4 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x10 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_dgemm_1x2_N3_M1_L71 ++.L_dgemm_1x2_N3_M1_L0: ++ xvld U0, C0, 0x00 ++ xvld U1, C1, 0x00 ++ xvinsve0.d U0, U1, 0x01 ++ xvinsve0.d D0, D4, 0x01 ++ GSUB xvf, d, U0, U0, D0 ++.endm ++ ++.macro dgemm_1x1 ++.L_dgemm_1x1: ++ /* Load 1 * 64 from A0 */ ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ /* line 1 */ ++ xvfmul.d D0, U0, U4 ++ ++ /* Add stride for A0 and B0 */ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ /* Reduce L */ ++ addi.d L, L, -1 ++ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_M1_L7 */ ++ beq ZERO,TL, .L_N1_M1_L7 ++ ++ xvld U8, A0, 0x00 ++ ++ addi.d TL, TL, -1 ++ ++ xvldrepl.d U12, B0, 0x00 ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ ++ beq ZERO, TL, .L_N1_M1_TL1_END ++.L_N1_M1_TL1: /* TL-- */ ++ KERNEL8x1x1 ++ addi.d TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N1_M1_TL1 ++.L_N1_M1_TL1_END: ++ KERNEL8x1x1_END ++.L_N1_M1_L7: ++ /* if (!(L & 7)) goto L_N1_M1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N1_M1_L0 ++ ++.L_N1_M1_L71: ++ xvld U0, A0, 0x00 ++ ++ xvldrepl.d U4, B0, 0x00 ++ xvfmadd.d D0, U0, U4, D0 ++ ++ /* Add stride for A0, B0 */ ++ addi.d A0, A0, 0x08 ++ addi.d B0, B0, 0x08 ++ ++ addi.d TL, TL, -1 ++ blt ZERO,TL, .L_N1_M1_L71 ++.L_N1_M1_L0: ++ /* Load C0 */ ++ xvld U0, C0, 0x00 ++ GSUB xvf, d, U0, U0, D0 ++.endm +diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S +new file mode 100644 +index 000000000..694dcdaa9 +--- /dev/null ++++ b/kernel/loongarch64/loongarch64_asm.S +@@ -0,0 +1,430 @@ 
++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++ ++#if __loongarch_grlen == 64 ++#define LA_REG int64_t ++#define REG_SIZE 8 ++#define REG_LOG 3 ++#define PTR_ADDI addi.d ++#define PTR_ADD add.d ++#define PTR_SUB sub.d ++#define PTR_LD ld.d ++#define PTR_ST st.d ++#define PTR_SLLI slli.d ++#define PTR_SRLI srli.d ++#define PTR_SRAI srai.d ++#define PTR_MUL mul.d ++#define PTR_ALSL alsl.d ++#elif __loongarch_grlen == 32 ++#define LA_REG int32_t ++#define REG_SIZE 4 ++#define REG_LOG 2 ++#define PTR_ADDI addi.w ++#define PTR_ADD add.w ++#define PTR_SUB sub.w ++#define PTR_LD ld.w ++#define PTR_ST st.w ++#define PTR_SLLI slli.w ++#define PTR_SRLI srli.w ++#define PTR_SRAI srai.w ++#define PTR_MUL mul.w ++#define PTR_ALSL alsl.w ++#else ++// If neither of the above two conditions is supported, it means this is an early ++// internal toolchain. To ensure maximum compatibility, the following approach is taken: ++#define LA_REG int64_t ++#define REG_SIZE 8 ++#define REG_LOG 3 ++#define PTR_ADDI addi.d ++#define PTR_ADD add.d ++#define PTR_SUB sub.d ++#define PTR_LD ld.d ++#define PTR_ST st.d ++#define PTR_SLLI slli.d ++#define PTR_SRLI srli.d ++#define PTR_SRAI srai.d ++#define PTR_MUL mul.d ++#define PTR_ALSL alsl.d ++#endif ++ ++#if __loongarch_frlen == 64 ++#define FREG_SIZE 8 ++#define FREG_LOG 3 ++#define PTR_FLD fld.d ++#define PTR_FST fst.d ++#elif __loongarch_frlen == 32 ++#define FREG_SIZE 4 ++#define FREG_LOG 2 ++#define PTR_FLD fld.s ++#define PTR_FST fst.s ++#else ++// If neither of the above two conditions is supported, it means this is an early ++// internal toolchain. To ensure maximum compatibility, the following approach is taken: ++#define FREG_SIZE 8 ++#define FREG_LOG 3 ++#define PTR_FLD fld.d ++#define PTR_FST fst.d ++#endif ++ ++// The max registers available to the user which ++// do not need to be preserved across calls. 
++// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html ++#define MAX_INT_CALLER_SAVED 17 ++#define MAX_FP_CALLER_SAVED 24 ++ ++.altmacro // Enable alternate macro mode ++ ++.macro push_if_used regs, fregs ++.if \regs > MAX_INT_CALLER_SAVED ++ PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG) ++ push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 ++.endif ++.if \fregs > MAX_FP_CALLER_SAVED ++ PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG) ++ push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 ++.endif ++.endm // End push_if_used ++.macro pop_if_used regs, fregs ++.if \fregs > MAX_FP_CALLER_SAVED ++ pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 ++ PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG ++.endif ++.if \regs > MAX_INT_CALLER_SAVED ++ pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 ++ PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG ++.endif ++.endm // End pop_if_used ++.macro push_regs from, to ++ PTR_ST $s\()\from, $sp, \from << REG_LOG ++.if \to - \from ++ push_regs %from + 1, \to ++.endif ++.endm // End push_regs ++.macro pop_regs from, to ++ PTR_LD $s\()\from, $sp, \from << REG_LOG ++.if \to - \from ++ pop_regs %from + 1, \to ++.endif ++.endm // End pop_regs ++.macro push_fregs from, to ++ PTR_FST $fs\()\from, $sp, \from << FREG_LOG ++.if \to - \from ++ push_fregs %from + 1, \to ++.endif ++.endm // End push_fregs ++.macro pop_fregs from, to ++ PTR_FLD $fs\()\from, $sp, \from << FREG_LOG ++.if \to - \from ++ pop_fregs %from + 1, \to ++.endif ++.endm // End pop_fregs ++ ++// ++// Instruction Related Macros ++// ++// GLD ++// ++.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg ++.ifeqs "\suf_op", "0" ++ \pre_op\()ld \out, \src, \offset ++.else ++ \pre_op\()ld.\suf_op \out, \src, \offset ++.endif ++.ifnb \more ++ GLD \pre_op, \suf_op, \more ++.endif ++.endm ++ ++// ++// GLD_INC ++// ++.macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg ++.ifeqs "\suf_op", "0" ++ \pre_op\()ld \out, \src, \offset ++.else ++ \pre_op\()ld.\suf_op \out, \src, \offset ++.endif ++ PTR_ADDI \src, \src, \inc ++.ifnb \more ++ GLD_INC \pre_op, \suf_op, \inc, \more ++.endif ++.endm ++// ++// GLDX is same as GLD except the stride is a register ++// ++.macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg ++.ifeqs "\suf_op", "0" ++ \pre_op\()ldx \out, \src, \offset ++.else ++ \pre_op\()ldx.\suf_op \out, \src, \offset ++.endif ++.ifnb \more ++ GLDX \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GLDREPL ++// ++.macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg ++ \pre_op\()ldrepl.\suf_op \out, \src, \offset ++.ifnb \more ++ GLDREPL \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GST ++// ++.macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg ++.ifeqs "\suf_op", "0" ++ \pre_op\()st \src, \dst, \offset ++.else ++ \pre_op\()st.\suf_op \src, \dst, \offset ++.endif ++.ifnb \more ++ GST \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GMUL ++// ++.macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg ++ \pre_op\()mul.\suf_op \out, \in0, \in1 ++.ifnb \more ++ GMUL \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GMADD ++// ++.macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg ++ \pre_op\()madd.\suf_op \out, \in0, \in1, \in2 ++.ifnb \more ++ GMADD \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GADD ++// ++.macro 
GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg ++ \pre_op\()add.\suf_op \out, \in0, \in1 ++.ifnb \more ++ GADD \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GADDI ++// ++.macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg ++ \pre_op\()addi.\suf_op \out, \in0, \in1 ++.ifnb \more ++ GADDI \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GSUB ++// ++.macro GSUB pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg ++ \pre_op\()sub.\suf_op \out, \in0, \in1 ++.ifnb \more ++ GSUB \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GSLLI ++// ++.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg ++ \pre_op\()slli.\suf_op \out, \in0, \in1 ++.ifnb \more ++ GSLLI \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GINSVE0 ++// ++.macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg ++ \pre_op\()insve0.\suf_op \out, \in0, \in1 ++.ifnb \more ++ GINSVE0 \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GXOR ++// ++.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg ++ \pre_op\()xor.\suf_op \out, \in0, \in1 ++.ifnb \more ++ GXOR \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GPERMI ++// ++.macro GPERMI pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg ++ \pre_op\()permi.\suf_op \out, \in0, \in1 ++.ifnb \more ++ GPERMI \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GNMSUB ++// ++.macro GNMSUB pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg ++ \pre_op\()nmsub.\suf_op \out, \in0, \in1, \in2 ++.ifnb \more ++ GNMSUB \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GPRELD ++// ++.macro GPRELD in0:req, in1:req, in2:req, more:vararg ++ preld \in0, \in1, \in2 ++.ifnb \more ++ GPRELD \more ++.endif ++.endm ++ ++// ++// Compound instructions ++// ++// GACC: Accumulate the values of vector registers ++// ++.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg ++.ifeqs "\pre_op", "xvf" ++ xvpermi.q \out, \in, 0x01 ++ \pre_op\()add.\suf_op \in, \out, \in ++ xvpackod.d \out, \in, \in ++ \pre_op\()add.\suf_op \out, \out, \in ++.ifeqs "\suf_op", "s" ++ xvpackod.w \in, \out, \out ++ \pre_op\()add.\suf_op \out, \out, \in ++.endif ++.endif ++ ++.ifeqs "\pre_op", "vf" ++ vpackod.d \out, \in, \in ++ \pre_op\()add.\suf_op \out, \out, \in ++.ifeqs "\suf_op", "s" ++ vpackod.w \in, \out, \out ++ \pre_op\()add.\suf_op \out, \out, \in ++.endif ++.endif ++ ++.ifeqs "\pre_op", "xv" ++ xvpermi.q \out, \in, 0x01 ++ \pre_op\()add.\suf_op \in, \out, \in ++ xvpackod.d \out, \in, \in ++ \pre_op\()add.\suf_op \out, \out, \in ++.ifnc "\suf_op", "d" ++ xvpackod.w \in, \out, \out ++ \pre_op\()add.\suf_op \out, \out, \in ++.ifnc "\suf_op", "w" ++ xvpackod.h \in, \out, \out ++ \pre_op\()add.\suf_op \out, \out, \in ++.ifnc "\suf_op", "h" ++ xvpackod.b \in, \out, \out ++ \pre_op\()add.\suf_op \out, \out, \in ++.endif ++.endif ++.endif ++.endif ++ ++.ifeqs "\pre_op", "v" ++ vpackod.d \out, \in, \in ++ \pre_op\()add.\suf_op \out, \out, \in ++.ifnc "\suf_op", "d" ++ vpackod.w \in, \out, \out ++ \pre_op\()add.\suf_op \out, \out, \in ++.ifnc "\suf_op", "w" ++ vpackod.h \in, \out, \out ++ \pre_op\()add.\suf_op \out, \out, \in ++.ifnc "\suf_op", "h" ++ vpackod.b \in, \out, \out ++ \pre_op\()add.\suf_op \out, \out, \in ++.endif ++.endif ++.endif ++.endif ++ ++.ifnb \more ++ GACC \pre_op, \suf_op, \more ++.endif ++.endm ++// ++// GMOV ++// ++.macro GMOV pre_op:req, out:req, in:req, more:vararg ++ \pre_op\()or.v \out, \in, \in ++.ifnb \more ++ GMOV \pre_op, \more ++.endif ++.endm ++ 
++// ++// Media Related Macros ++// ++.macro GSBUTTERFLY pre_op, suf_op, out0, out1, in0, in1 ++ \pre_op\()ilvl.\suf_op \out0, \in0, \in1 ++ \pre_op\()ilvh.\suf_op \out1, \in0, \in1 ++.endm ++.macro GINTERLACE pre_op, suf_op, out0, out1, in0, in1 ++ \pre_op\()pickev.\suf_op \out0, \in0, \in1 ++ \pre_op\()pickod.\suf_op \out1, \in0, \in1 ++.endm ++ ++// ++// TRANSPOSE4x4_D: Transpose 4x4 block with double-word elements in vectors, ++// has no pre_op param. 128-bit vector instructions are not supported. ++// ++.macro GTRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \ ++ vt0, vt1 ++ GSBUTTERFLY xv, d, \vt0, \out1, \in1, \in0 ++ GSBUTTERFLY xv, d, \vt1, \out3, \in3, \in2 ++ GMOV xv, \out0, \vt0, \out2, \vt1, \vt1, \out3 ++ GPERMI xv, q, \out0, \out2, 0x02, \out2, \vt0, 0x31, \out3, \out1, 0x31, \out1, \vt1, 0x02 ++.endm ++ ++.macro GTRANSPOSE8x8_W out0, out1, out2, out3, out4, out5, out6, out7, \ ++ in0, in1, in2, in3, in4, in5, in6, in7, \ ++ tmp0, tmp1, tmp2, tmp3 ++ GSBUTTERFLY xv, w, \tmp0, \tmp2, \in2, \in0 ++ GSBUTTERFLY xv, w, \tmp1, \tmp3, \in3, \in1 ++ GSBUTTERFLY xv, w, \out0, \out1, \tmp1, \tmp0 ++ GSBUTTERFLY xv, w, \out2, \out3, \tmp3, \tmp2 ++ ++ GSBUTTERFLY xv, w, \tmp0, \tmp2, \in6, \in4 ++ GSBUTTERFLY xv, w, \tmp1, \tmp3, \in7, \in5 ++ GSBUTTERFLY xv, w, \out4, \out5, \tmp1, \tmp0 ++ GSBUTTERFLY xv, w, \out6, \out7, \tmp3, \tmp2 ++ ++ GMOV xv, \tmp0, \out0, \tmp1, \out1, \tmp2, \out2, \tmp3, \out3 ++ ++ GPERMI xv, q, \out0, \out4, 0x02, \out1, \out5, 0x02, \ ++ \out2, \out6, 0x02, \out3, \out7, 0x02, \ ++ \out4, \tmp0, 0x31, \out5, \tmp1, 0x31, \ ++ \out6, \tmp2, 0x31, \out7, \tmp3, 0x31 ++.endm +diff --git a/kernel/loongarch64/sgemm_kernel_16x8_lasx.S b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S +new file mode 100644 +index 000000000..bd609394e +--- /dev/null ++++ b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S +@@ -0,0 +1,2348 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/08/23 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++* 2023/08/23 guxiwei ++* Parameter: ++* SGEMM_DEFAULT_UNROLL_N 8 ++* SGEMM_DEFAULT_UNROLL_M 16 ++* SGEMM_DEFAULT_P 256 ++* SGEMM_DEFAULT_Q 256 ++* SGEMM_DEFAULT_R 1024 ++* A_PRE 1024 ++* B_PRE 256 // Enable prefetching for B results in a performance decrease, temporarily disabled. ++* ++* ++* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000: ++* 1 thread: 71.7 GFLOPS ++* 2 threads: 142.6 GFLOPS ++* 3 threads: 211.5 GFLOPS ++* 4 threads: 265.0 GFLOPS ++*********************************************************************/ ++ ++/* Function parameters */ ++#define M $r4 // param 1: bm ++#define N $r5 // param 2: bn ++#define K $r6 // param 3: bk ++#define ALPHA $f0 // param 4: alpha ++#define A $r7 // param 5: ba ++#define B $r8 // param 6: bb ++#define C $r9 // param 7: bc ++#define LDC $r10 // param 8: ldc ++ ++#ifdef TRMMKERNEL ++#define OFFSET $r11 // param 9: offset ++#endif ++#define OFF $r12 ++ ++/* Cycle control parameters */ ++#define I $r13 ++#define J $r14 ++#define L $r15 ++#define TL $r16 ++/* Matrix address */ ++#define A0 $r17 ++#define B0 $r18 ++#define C0 $r19 ++#define C1 $r20 ++#define C2 $r23 ++#define C3 $r24 ++#define C4 $r25 ++#define C5 $r26 ++#define C6 $r27 ++#define C7 $r28 ++#define T0 $r29 ++#define T1 $r30 ++#undef ZERO ++#define ZERO $r0 ++ ++/* LASX Vectors ++ * Store 16 sets of 32-bit data in A using UO and U1, with each register holding 8 data. ++ * Use X0 through X7 to store 8 sets of 32-bit data in B, with each register holding a broadcast value of a single data. ++ * Use D0 to D15 to store intermediate values of the computation. ++ * Use VALPHA to store the broadcast value of alpha ++ */ ++#define U0 $xr0 ++#define U1 $xr1 ++#define X0 $xr2 ++#define X1 $xr3 ++#define X2 $xr4 ++#define X3 $xr5 ++#define X4 $xr6 ++#define X5 $xr7 ++#define X6 $xr8 ++#define X7 $xr9 ++#define D0 $xr10 ++#define D1 $xr11 ++#define D2 $xr12 ++#define D3 $xr13 ++#define D4 $xr14 ++#define D5 $xr15 ++#define D6 $xr16 ++#define D7 $xr17 ++#define D8 $xr18 ++#define D9 $xr19 ++#define D10 $xr20 ++#define D11 $xr21 ++#define D12 $xr22 ++#define D13 $xr23 ++#define D14 $xr24 ++#define D15 $xr25 ++#define VALPHA $xr26 ++ ++/* Prefetch interval */ ++#define A_PRE 0x400 ++#define B_PRE 0x100 ++ ++// Loops outline: ++// .L_N8 <-------------------------------------------------------------------------------------------- /* if N >> 3 == 0, goto .L_N7; else, enter .L_N8. */ ++// | .L_M16 <--------------------- | /* if M >> 4 == 0, goto .L_M8; Otherwise, enter .L_M16. 
*/ ++// | | .L_M16_TL1 | | ++// | | .L_M16_L7 | The entire core loop of the function, KERNEK16x8 | ++// | | .L_M16_L71 | | ++// | | .L_M16_L0 ---------------- | ++// | .L_M8 | ++// | | .L_M8_TL1 | | ++// | | .L_M8_L7 | KERNEK8x8 | ++// | | .L_M8_L71 | | ++// | | .L_M8_L0 | | ++// | .L_M4 | ++// | | .L_M4_TL1 | | ++// | | .L_M4_L7 | KERNEK4x8 | ++// | | .L_M4_L71 | | ++// | | .L_M4_L0 | | ++// | .L_M2 | ++// | | .L_M2_TL1 | | ++// | | .L_M2_L7 | KERNEK2x8 | ++// | | .L_M2_L71 | | ++// | | .L_M2_L0 | | ++// | .L_M1 | ++// | | .L_M1_TL1 | | ++// | | .L_M1_L7 | KERNEK1x8 | ++// | | .L_M1_L71 | | ++// | | .L_M1_L0 | | ++// | .L_M0------------------------------------------------------------------------------------------ ++// .L_N7 /* if N & 7 == 0, goto .L_N0; else, enter .L_N4 */ ++// .L_N4 ++// | .L_N4_M16 <--------------------- ++// | | .L_N4_M16_TL1 | ++// | | .L_N4_M16_L7 | KERNEL16x4 ++// | | .L_N4_M16_L71 | ++// | | .L_N4_M16_L0 ---------------- ++// | .L_N4_M8 ++// | | .L_N4_M8_TL1 | ++// | | .L_N4_M8_L7 | KERNEL8x4 ++// | | .L_N4_M8_L71 | ++// | | .L_N4_M8_L0 | ++// | .L_N4_M4 ++// | | .L_N4_M4_TL1 | ++// | | .L_N4_M4_L7 | KERNEL4x4 ++// | | .L_N4_M4_L71 | ++// | | .L_N4_M4_L0 | ++// | .L_N4_M2 ++// | | .L_N4_M2_TL1 | ++// | | .L_N4_M2_L7 | KERNEL2x4 ++// | | .L_N4_M2_L71 | ++// | | .L_N4_M2_L0 | ++// | .L_N4_M1 ++// | | .L_N4_M1_TL1 | ++// | | .L_N4_M1_L7 | KERNEL1x4 ++// | | .L_N4_M1_L71 | ++// | | .L_N4_M1_L0 | ++// | .L_N4_M0 ++// .L_N3 /* if N & 2 == 0, goto .L_N1; else enter .L_N2 */ ++// .L_N2 ++// | .L_N2_M16 <--------------------- ++// | | .L_N2_M16_TL1 | ++// | | .L_N2_M16_L7 | KERNEL16x2 ++// | | .L_N2_M16_L71 | ++// | | .L_N2_M16_L0 ---------------- ++// | .L_N2_M8 ++// | | .L_N2_M8_TL1 | ++// | | .L_N2_M8_L7 | KERNEL8x2 ++// | | .L_N2_M8_L71 | ++// | | .L_N2_M8_L0 | ++// | .L_N2_M4 ++// | | .L_N2_M4_TL1 | ++// | | .L_N2_M4_L7 | KERNEL4x2 ++// | | .L_N2_M4_L71 | ++// | | .L_N2_M4_L0 | ++// | .L_N2_M2 ++// | | .L_N2_M2_TL1 | ++// | | .L_N2_M2_L7 | KERNEL2x2 ++// | | .L_N2_M2_L71 | ++// | | .L_N2_M2_L0 | ++// | .L_N2_M1 ++// | | .L_N2_M1_TL1 | ++// | | .L_N2_M1_L7 | KERNEL1x2 ++// | | .L_N2_M1_L71 | ++// | | .L_N2_M1_L0 | ++// | .L_N2_M0 ++// .L_N1 ++// | .L_N1_M16 <--------------------- ++// | | .L_N1_M16_TL1 | ++// | | .L_N1_M16_L7 | KERNEL16x1 ++// | | .L_N1_M16_L71 | ++// | | .L_N1_M16_L0 ---------------- ++// | .L_N1_M8 ++// | | .L_N1_M8_TL1 | ++// | | .L_N1_M8_L7 | KERNEL8x1 ++// | | .L_N1_M8_L71 | ++// | | .L_N1_M8_L0 | ++// | .L_N1_M4 ++// | | .L_N1_M4_TL1 | ++// | | .L_N1_M4_L7 | KERNEL4x1 ++// | | .L_N1_M4_L71 | ++// | | .L_N1_M4_L0 | ++// | .L_N1_M2 ++// | | .L_N1_M2_TL1 | ++// | | .L_N1_M2_L7 | KERNEL2x1 ++// | | .L_N1_M2_L71 | ++// | | .L_N1_M2_L0 | ++// | .L_N1_M1 ++// | | .L_N1_M1_TL1 | ++// | | .L_N1_M1_L7 | KERNEL1x1 ++// | | .L_N1_M1_L71 | ++// | | .L_N1_M1_L0 | ++// | .L_N1_M0 ++// .L_N0 ++ ++/*************** sgemm_kernel_macros ***************/ ++.macro KERNEL1x16x8_START ++ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 ++ ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C ++ GMUL xvf, s, D0, U0, X0, D1, U1, X0 ++ preld 0, C0, 0x00 ++ GMUL xvf, s, D2, U0, X1, D3, U1, X1 ++ preld 0, C1, 0x00 ++ GMUL xvf, s, D4, U0, X2, D5, U1, X2 ++ preld 0, C2, 0x00 ++ GMUL xvf, s, D6, U0, X3, D7, U1, X3 ++ preld 0, C3, 0x00 ++ GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C ++ GMUL xvf, s, D8, U0, X4, D9, U1, X4 ++ preld 0, C4, 0x00 ++ GMUL xvf, s, D10, U0, X5, D11, U1, X5 ++ preld 0, C5, 0x00 ++ GMUL xvf, s, D12, U0, X6, D13, U1, X6 ++ 
preld 0, C6, 0x00 ++ GMUL xvf, s, D14, U0, X7, D15, U1, X7 ++ preld 0, C7, 0x00 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x20 ++.endm ++ ++.macro KERNEL1x16x8 ++ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 ++ ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C ++ GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ ++ D2, U0, X1, D2, D3, U1, X1, D3 ++ preld 0, A0, A_PRE ++ GMADD xvf, s, D4, U0, X2, D4, D5, U1, X2, D5, \ ++ D6, U0, X3, D6, D7, U1, X3 D7 ++ GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C ++ GMADD xvf, s, D8, U0, X4, D8, D9, U1, X4, D9, \ ++ D10, U0, X5, D10, D11, U1, X5, D11 ++ //preld 0, B0, B_PRE ++ GMADD xvf, s, D12, U0, X6, D12, D13, U1, X6, D13, \ ++ D14, U0, X7, D14, D15, U1, X7 D15 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x20 ++.endm ++ ++.macro KERNEL8x16x8 ++.rept 8 ++ KERNEL1x16x8 ++.endr ++.endm ++ ++.macro SAVE16x8 ++#if defined(TRMMKERNEL) ++ GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ ++ D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA, \ ++ D8, D8, VALPHA, D9, D9, VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \ ++ D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA ++#else ++ /* Load C0 */ ++ GLD xv, , X0, C0, 0x00, X1, C0, 0x20 ++ GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 ++ /* Load C1 */ ++ GLD xv, , X2, C1, 0x00, X3, C1, 0x20 ++ GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 ++ /* Load C2 */ ++ GLD xv, , X4, C2, 0x00, X5, C2, 0x20 ++ GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5 ++ /* Load C3 */ ++ GLD xv, , X6, C3, 0x00, X7, C3, 0x20 ++ GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7 ++ /* Load C4 */ ++ GLD xv, , X0, C4, 0x00, X1, C4, 0x20 ++ GMADD xvf, s, D8, D8, VALPHA, X0, D9, D9, VALPHA, X1 ++ /* Load C5 */ ++ GLD xv, , X2, C5, 0x00, X3, C5, 0x20 ++ GMADD xvf, s, D10, D10, VALPHA, X2, D11, D11, VALPHA, X3 ++ /* Load C6 */ ++ GLD xv, , X4, C6, 0x00, X5, C6, 0x20 ++ GMADD xvf, s, D12, D12, VALPHA, X4, D13, D13, VALPHA, X5 ++ /* Load C7 */ ++ GLD xv, , X6, C7, 0x00, X7, C7, 0x20 ++ GMADD xvf, s, D14, D14, VALPHA, X6, D15, D15, VALPHA, X7 ++#endif // #if defined(TRMMKERNEL) ++ GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ ++ D2, C1, 0x00, D3, C1, 0x20, \ ++ D4, C2, 0x00, D5, C2, 0x20, \ ++ D6, C3, 0x00, D7, C3, 0x20, \ ++ D8, C4, 0x00, D9, C4, 0x20, \ ++ D10, C5, 0x00, D11, C5, 0x20, \ ++ D12, C6, 0x00, D13, C6, 0x20, \ ++ D14, C7, 0x00, D15, C7, 0x20 ++#if __loongarch_grlen == 64 ++ GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ ++ C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 ++#elif __loongarch_grlen == 32 ++ GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ ++ C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 ++#else ++ GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ ++ C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 ++#endif ++.endm ++ ++// m = 8, 4, 2, 1 ++// stride = 0x20, 0x10, 0x08, 0x04 ++.macro KERNEL1xMx8_START m, stride ++.if \m == 8 ++ GLD xv, , U0, A0, 0x00 ++.elseif \m == 4 ++ GLD v, , $vr0, A0, 0x00 ++.elseif \m ==2 ++ GLD f, d, $f0, A0, 0x00 ++.elseif \m ==1 ++ GLD f, s, $f0, A0, 0x00 ++.endif ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C ++ GMUL xvf, s, D0, U0, X0, D2, U0, X1, \ ++ D4, U0, X2, D6, U0, X3 ++ GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C ++ GMUL xvf, s, D8, U0, X4, D10, U0, X5, \ ++ D12, U0, X6, D14, U0, X7 ++ PTR_ADDI A0, A0, \stride ++ PTR_ADDI B0, B0, 0x20 
++.endm ++ ++.macro KERNEL1xMx8 m, stride ++.if \m == 8 ++ GLD xv, , U0, A0, 0x00 ++.elseif \m == 4 ++ GLD v, , $vr0, A0, 0x00 ++.elseif \m ==2 ++ GLD f, d, $f0, A0, 0x00 ++.elseif \m ==1 ++ GLD f, s, $f0, A0, 0x00 ++.endif ++ ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C ++ GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \ ++ D4, U0, X2, D4, D6, U0, X3, D6 ++ GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C ++ GMADD xvf, s, D8, U0, X4, D8, D10, U0, X5, D10, \ ++ D12, U0, X6, D12, D14, U0, X7, D14 ++ PTR_ADDI A0, A0, \stride ++ PTR_ADDI B0, B0, 0x20 ++.endm ++ ++.macro KERNEL8xMx8 m, stride ++.rept 8 ++ KERNEL1xMx8 \m, \stride ++.endr ++.endm ++ ++.macro SAVEMx8 m, stride ++#if defined(TRMMKERNEL) ++ GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \ ++ D4, D4, VALPHA, D6, D6, VALPHA, \ ++ D8, D8, VALPHA, D10, D10, VALPHA, \ ++ D12, D12, VALPHA, D14, D14, VALPHA ++#else ++ /* Load C0, C1, C2, C3, C4, C5, C6, C7 */ ++ .if \m == 8 ++ GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00 ++ .elseif \m == 4 ++ GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00 ++.elseif \m == 2 ++ GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 ++.elseif \m == 1 ++ GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 ++ .endif ++ GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \ ++ D4, D4, VALPHA, X4, D6, D6, VALPHA, X6 ++.if \m == 8 ++ GLD xv, , X0, C4, 0x00, X2, C5, 0x00, X4, C6, 0x00, X6, C7, 0x00 ++.elseif \m == 4 ++ GLD v, , $vr2, C4, 0x00, $vr4, C5, 0x00, $vr6, C6, 0x00, $vr8, C7, 0x00 ++.elseif \m == 2 ++ GLD f, d, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00 ++.elseif \m == 1 ++ GLD f, s, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00 ++.endif ++ GMADD xvf, s, D8, D8, VALPHA, X0, D10, D10, VALPHA, X2, \ ++ D12, D12, VALPHA, X4, D14, D14, VALPHA, X6 ++#endif // #if defined(TRMMKERNEL) ++.if \m == 8 ++ GST xv, , D0, C0, 0x00, D2, C1, 0x00, \ ++ D4, C2, 0x00, D6, C3, 0x00, \ ++ D8, C4, 0x00, D10, C5, 0x00, \ ++ D12, C6, 0x00, D14, C7, 0x00 ++.elseif \m == 4 ++ GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \ ++ $vr14, C2, 0x00, $vr16, C3, 0x00, \ ++ $vr18, C4, 0x00, $vr20, C5, 0x00, \ ++ $vr22, C6, 0x00, $vr24, C7, 0x00 ++.elseif \m == 2 ++ GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \ ++ $f14, C2, 0x00, $f16, C3, 0x00, \ ++ $f18, C4, 0x00, $f20, C5, 0x00, \ ++ $f22, C6, 0x00, $f24, C7, 0x00 ++.elseif \m == 1 ++ GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \ ++ $f14, C2, 0x00, $f16, C3, 0x00, \ ++ $f18, C4, 0x00, $f20, C5, 0x00, \ ++ $f22, C6, 0x00, $f24, C7, 0x00 ++.endif ++#if __loongarch_grlen == 64 ++ GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ ++ C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride ++#elif __loongarch_grlen == 32 ++ GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ ++ C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride ++#else ++ GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ ++ C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride ++#endif ++.endm ++ ++.macro KERNEL1x16x4_START ++ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 ++ ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C ++ GMUL xvf, s, D0, U0, X0, D1, U1, X0, \ ++ D2, U0, X1, D3, U1, X1, \ ++ D4, U0, X2, D5, U1, X2, \ ++ D6, U0, X3, D7, U1, X3 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x10 ++.endm ++ ++.macro KERNEL1x16x4 ++ GLD xv, , 
U0, A0, 0x00, U1, A0, 0x20 ++ ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C ++ GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ ++ D2, U0, X1, D2, D3, U1, X1, D3, \ ++ D4, U0, X2, D4, D5, U1, X2, D5, \ ++ D6, U0, X3, D6, D7, U1, X3 D7 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x10 ++.endm ++ ++.macro KERNEL8x16x4 ++.rept 8 ++ KERNEL1x16x4 ++.endr ++.endm ++ ++.macro SAVE16x4 ++#if defined(TRMMKERNEL) ++ GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ ++ D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA ++#else ++ /* Load C0 */ ++ GLD xv, , X0, C0, 0x00, X1, C0, 0x20 ++ GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 ++ /* Load C1 */ ++ GLD xv, , X2, C1, 0x00, X3, C1, 0x20 ++ GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 ++ /* Load C2 */ ++ GLD xv, , X4, C2, 0x00, X5, C2, 0x20 ++ GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5 ++ /* Load C3 */ ++ GLD xv, , X6, C3, 0x00, X7, C3, 0x20 ++ GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7 ++#endif // #if defined(TRMMKERNEL) ++ GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ ++ D2, C1, 0x00, D3, C1, 0x20, \ ++ D4, C2, 0x00, D5, C2, 0x20, \ ++ D6, C3, 0x00, D7, C3, 0x20 ++#if __loongarch_grlen == 64 ++ GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 ++#elif __loongarch_grlen == 32 ++ GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 ++#else ++ GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 ++#endif ++.endm ++ ++// m = 8, 4, 2, 1 ++// stride = 0x20, 0x10, 0x08, 0x04 ++.macro KERNEL1xMx4_START m, stride ++.if \m == 8 ++ GLD xv, , U0, A0, 0x00 ++.elseif \m == 4 ++ GLD v, , $vr0, A0, 0x00 ++.elseif \m ==2 ++ GLD f, d, $f0, A0, 0x00 ++.elseif \m ==1 ++ GLD f, s, $f0, A0, 0x00 ++.endif ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C ++ GMUL xvf, s, D0, U0, X0, D2, U0, X1, \ ++ D4, U0, X2, D6, U0, X3 ++ PTR_ADDI A0, A0, \stride ++ PTR_ADDI B0, B0, 0x10 ++.endm ++ ++.macro KERNEL1xMx4 m, stride ++.if \m == 8 ++ GLD xv, , U0, A0, 0x00 ++.elseif \m == 4 ++ GLD v, , $vr0, A0, 0x00 ++.elseif \m ==2 ++ GLD f, d, $f0, A0, 0x00 ++.elseif \m ==1 ++ GLD f, s, $f0, A0, 0x00 ++.endif ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C ++ GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \ ++ D4, U0, X2, D4, D6, U0, X3, D6 ++ PTR_ADDI A0, A0, \stride ++ PTR_ADDI B0, B0, 0x10 ++.endm ++ ++.macro KERNEL8xMx4 m, stride ++.rept 8 ++ KERNEL1xMx4 \m, \stride ++.endr ++.endm ++ ++.macro SAVEMx4 m, stride ++#if defined(TRMMKERNEL) ++ GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \ ++ D4, D4, VALPHA, D6, D6, VALPHA ++#else ++ /* Load C0, C1, C2, C3 */ ++ .if \m == 8 ++ GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00 ++ .elseif \m == 4 ++ GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00 ++.elseif \m == 2 ++ GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 ++.elseif \m == 1 ++ GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 ++ .endif ++ GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \ ++ D4, D4, VALPHA, X4, D6, D6, VALPHA, X6 ++#endif // #if defined(TRMMKERNEL) ++.if \m == 8 ++ GST xv, , D0, C0, 0x00, D2, C1, 0x00, \ ++ D4, C2, 0x00, D6, C3, 0x00 ++.elseif \m == 4 ++ GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \ ++ $vr14, C2, 0x00, $vr16, C3, 0x00 ++.elseif \m == 2 ++ GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \ ++ $f14, C2, 0x00, $f16, C3, 0x00 ++.elseif \m == 1 ++ GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \ ++ 
$f14, C2, 0x00, $f16, C3, 0x00 ++.endif ++#if __loongarch_grlen == 64 ++ GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride ++#elif __loongarch_grlen == 32 ++ GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride ++#else ++ GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride ++#endif ++.endm ++ ++.macro KERNEL1x16x2_START ++ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 ++ ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 ++ GMUL xvf, s, D0, U0, X0, D1, U1, X0, \ ++ D2, U0, X1, D3, U1, X1 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x08 ++.endm ++ ++.macro KERNEL1x16x2 ++ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 ++ ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 ++ GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ ++ D2, U0, X1, D2, D3, U1, X1, D3 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x08 ++.endm ++ ++.macro KERNEL8x16x2 ++.rept 8 ++ KERNEL1x16x2 ++.endr ++.endm ++ ++.macro SAVE16x2 ++#if defined(TRMMKERNEL) ++ GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA ++#else ++ /* Load C0 */ ++ GLD xv, , X0, C0, 0x00, X1, C0, 0x20 ++ GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 ++ /* Load C1 */ ++ GLD xv, , X2, C1, 0x00, X3, C1, 0x20 ++ GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 ++#endif // #if defined(TRMMKERNEL) ++ GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ ++ D2, C1, 0x00, D3, C1, 0x20 ++#if __loongarch_grlen == 64 ++ GADDI , d, C0, C0, 0x40, C1, C1, 0x40 ++#elif __loongarch_grlen == 32 ++ GADDI , w, C0, C0, 0x40, C1, C1, 0x40 ++#else ++ GADDI , d, C0, C0, 0x40, C1, C1, 0x40 ++#endif ++.endm ++ ++// m = 8, 4, 2, 1 ++// stride = 0x20, 0x10, 0x08, 0x04 ++.macro KERNEL1xMx2_START m, stride ++.if \m == 8 ++ GLD xv, , U0, A0, 0x00 ++.elseif \m == 4 ++ GLD v, , $vr0, A0, 0x00 ++.elseif \m ==2 ++ GLD f, d, $f0, A0, 0x00 ++.elseif \m ==1 ++ GLD f, s, $f0, A0, 0x00 ++.endif ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 ++ GMUL xvf, s, D0, U0, X0, D2, U0, X1 ++ PTR_ADDI A0, A0, \stride ++ PTR_ADDI B0, B0, 0x08 ++.endm ++ ++.macro KERNEL1xMx2 m, stride ++.if \m == 8 ++ GLD xv, , U0, A0, 0x00 ++.elseif \m == 4 ++ GLD v, , $vr0, A0, 0x00 ++.elseif \m ==2 ++ GLD f, d, $f0, A0, 0x00 ++.elseif \m ==1 ++ GLD f, s, $f0, A0, 0x00 ++.endif ++ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 ++ GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2 ++ PTR_ADDI A0, A0, \stride ++ PTR_ADDI B0, B0, 0x08 ++.endm ++ ++.macro KERNEL8xMx2 m, stride ++.rept 8 ++ KERNEL1xMx2 \m, \stride ++.endr ++.endm ++ ++.macro SAVEMx2 m, stride ++#if defined(TRMMKERNEL) ++ GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA ++#else ++ /* Load C0, C1 */ ++ .if \m == 8 ++ GLD xv, , X0, C0, 0x00, X2, C1, 0x00 ++ .elseif \m == 4 ++ GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00 ++.elseif \m == 2 ++ GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00 ++.elseif \m == 1 ++ GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00 ++ .endif ++ GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2 ++#endif // #if defined(TRMMKERNEL) ++.if \m == 8 ++ GST xv, , D0, C0, 0x00, D2, C1, 0x00 ++.elseif \m == 4 ++ GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00 ++.elseif \m == 2 ++ GST f, d, $f10, C0, 0x00, $f12, C1, 0x00 ++.elseif \m == 1 ++ GST f, s, $f10, C0, 0x00, $f12, C1, 0x00 ++.endif ++#if __loongarch_grlen == 64 ++ GADDI , d, C0, C0, \stride, C1, C1, \stride ++#elif __loongarch_grlen == 32 ++ GADDI , w, C0, C0, \stride, C1, C1, \stride ++#else ++ GADDI , d, C0, C0, \stride, C1, C1, \stride ++#endif ++.endm ++ ++.macro KERNEL1x16x1_START ++ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 ++ GLDREPL xv, w, X0, B0, 0x00 ++ 
GMUL xvf, s, D0, U0, X0, D1, U1, X0 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x04 ++.endm ++ ++.macro KERNEL1x16x1 ++ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 ++ GLDREPL xv, w, X0, B0, 0x00 ++ GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1 ++ PTR_ADDI A0, A0, 0x40 ++ PTR_ADDI B0, B0, 0x04 ++.endm ++ ++.macro KERNEL8x16x1 ++.rept 8 ++ KERNEL1x16x1 ++.endr ++.endm ++ ++.macro SAVE16x1 ++#if defined(TRMMKERNEL) ++ GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA ++#else ++ /* Load C0 */ ++ GLD xv, , X0, C0, 0x00, X1, C0, 0x20 ++ GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 ++#endif // #if defined(TRMMKERNEL) ++ GST xv, , D0, C0, 0x00, D1, C0, 0x20 ++#if __loongarch_grlen == 64 ++ GADDI , d, C0, C0, 0x40 ++#elif __loongarch_grlen == 32 ++ GADDI , w, C0, C0, 0x40 ++#else ++ GADDI , d, C0, C0, 0x40 ++#endif ++.endm ++ ++// m = 8, 4, 2, 1 ++// stride = 0x20, 0x10, 0x08, 0x04 ++.macro KERNEL1xMx1_START m, stride ++.if \m == 8 ++ GLD xv, , U0, A0, 0x00 ++.elseif \m == 4 ++ GLD v, , $vr0, A0, 0x00 ++.elseif \m ==2 ++ GLD f, d, $f0, A0, 0x00 ++.elseif \m ==1 ++ GLD f, s, $f0, A0, 0x00 ++.endif ++ GLDREPL xv, w, X0, B0, 0x00 ++ GMUL xvf, s, D0, U0, X0 ++ PTR_ADDI A0, A0, \stride ++ PTR_ADDI B0, B0, 0x04 ++.endm ++ ++.macro KERNEL1xMx1 m, stride ++.if \m == 8 ++ GLD xv, , U0, A0, 0x00 ++.elseif \m == 4 ++ GLD v, , $vr0, A0, 0x00 ++.elseif \m ==2 ++ GLD f, d, $f0, A0, 0x00 ++.elseif \m ==1 ++ GLD f, s, $f0, A0, 0x00 ++.endif ++ GLDREPL xv, w, X0, B0, 0x00 ++ GMADD xvf, s, D0, U0, X0, D0 ++ PTR_ADDI A0, A0, \stride ++ PTR_ADDI B0, B0, 0x04 ++.endm ++ ++.macro KERNEL8xMx1 m, stride ++.rept 8 ++ KERNEL1xMx1 \m, \stride ++.endr ++.endm ++ ++.macro SAVEMx1 m, stride ++#if defined(TRMMKERNEL) ++ GMUL xvf, s, D0, D0, VALPHA ++#else ++ /* Load C0, C1 */ ++ .if \m == 8 ++ GLD xv, , X0, C0, 0x00 ++ .elseif \m == 4 ++ GLD v, , $vr2, C0, 0x00 ++.elseif \m == 2 ++ GLD f, d, $f2, C0, 0x00 ++.elseif \m == 1 ++ GLD f, s, $f2, C0, 0x00 ++ .endif ++ GMADD xvf, s, D0, D0, VALPHA, X0 ++#endif // #if defined(TRMMKERNEL) ++.if \m == 8 ++ GST xv, , D0, C0, 0x00 ++.elseif \m == 4 ++ GST v, , $vr10, C0, 0x00 ++.elseif \m == 2 ++ GST f, d, $f10, C0, 0x00 ++.elseif \m == 1 ++ GST f, s, $f10, C0, 0x00 ++.endif ++#if __loongarch_grlen == 64 ++ GADDI , d, C0, C0, \stride ++#elif __loongarch_grlen == 32 ++ GADDI , w, C0, C0, \stride ++#else ++ GADDI , d, C0, C0, \stride ++#endif ++.endm ++ ++ PROLOGUE ++ push_if_used 26, 32 ++ xvreplve0.w VALPHA, $xr0 ++#if defined (TRMMKERNEL) && !defined(LEFT) ++ PTR_SUB OFF, ZERO, OFFSET ++#else ++ xor OFF, OFF, OFF ++#endif ++ /* if (!(N >> 3)) goto L_N7 */ ++ PTR_SRAI J, N, 3 /* J = bn >> 3 */ ++ andi N, N, 0x07 ++ beq ZERO, J, .L_N7 ++.L_N8: /* J -- */ ++ move C0, C ++ move A0, A ++ PTR_SLLI T0, LDC, 2 ++ PTR_ADDI J, J, -1 /* J-- */ ++#if __loongarch_grlen == 64 ++ GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ ++ C6, C5, T0, C7, C6, T0 ++#elif __loongarch_grlen == 32 ++ GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ ++ C6, C5, T0, C7, C6, T0 ++#else ++ GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ ++ C6, C5, T0, C7, C6, T0 ++#endif ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move OFF, OFFSET ++#endif ++ /* if (!(M >> 4)) goto L_M8 */ ++ PTR_SRAI I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_M8 ++.align 5 ++.L_M16: /* I-- */ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x06 ++ PTR_ADD A0, A0, T0 /* A0 += 16 * OFF 
*/ ++ PTR_SLLI T0, OFF, 0x05 ++ PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ ++#endif ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 16 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 8 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1x16x8_START ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M16_L7 */ ++ beq ZERO,TL, .L_M16_L7 ++.align 5 ++.L_M16_TL1: ++ KERNEL8x16x8 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_M16_TL1 ++.L_M16_L7: ++ andi TL, L, 7 ++ beq TL, ZERO,.L_M16_L0 ++.align 5 ++.L_M16_L71: ++ KERNEL1x16x8 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_M16_L71 ++.L_M16_L0: ++ SAVE16x8 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ PTR_ADDI L, L, -16 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, L, -8 ++#endif ++ PTR_SLLI T0, L, 0x06 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x05 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x10 /* number of values in A */ ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++ PTR_ADDI I, I, -1 /* I-- */ ++ blt ZERO,I, .L_M16 ++.L_M8: ++ /* We have done M & 16, considering M=8/4/2/1 */ ++ andi I, M, 15 ++ beq ZERO,I, .L_M0 ++ ++ andi I, M, 8 ++ beq ZERO,I, .L_M4 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x05 ++ PTR_ADD A0, A0, T0 /* A0 += 8 * OFF */ ++ PTR_SLLI T0, OFF, 0x05 ++ PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ ++#endif ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 8 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 8 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif // #if defined(TRMMKERNEL) ++ KERNEL1xMx8_START 8, 0x20 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M8_L7 */ ++ beq ZERO,TL, .L_M8_L7 ++.align 5 ++.L_M8_TL1: ++ KERNEL8xMx8 8, 0x20 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_M8_TL1 ++.L_M8_L7: ++ /* if (!(L & 7)) goto L_M8_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_M8_L0 ++.align 5 ++.L_M8_L71: ++ KERNEL1xMx8 8, 0x20 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_M8_L71 ++.L_M8_L0: ++ SAVEMx8 8, 0x20 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ PTR_ADDI L, L, -8 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, L, -8 ++#endif ++ PTR_SLLI T0, L, 0x05 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x05 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ /* number of values in A */ ++ PTR_ADDI OFF, OFF, 0x08 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_M4: ++ andi I, M, 4 ++ beq ZERO,I, .L_M2 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x04 ++ PTR_ADD A0, A0, T0 /* A0 += 4 * OFF */ ++ PTR_SLLI T0, OFF, 0x05 ++ PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 
++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 4 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 8 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx8_START 4, 0x10 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M4_L7 */ ++ beq ZERO,TL, .L_M4_L7 ++.align 5 ++.L_M4_TL1: ++ KERNEL8xMx8 4, 0x10 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_M4_TL1 ++.L_M4_L7: ++ /* if (!(L & 7)) goto L_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_M4_L0 ++.L_M4_L71: ++ KERNEL1xMx8 4, 0x10 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_M4_L71 ++.L_M4_L0: ++ SAVEMx8 4, 0x10 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ PTR_ADDI L, L, -4 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, L, -8 ++#endif ++ PTR_SLLI T0, L, 0x04 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x05 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ /* number of values in A */ ++ PTR_ADDI OFF, OFF, 0x04 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_M2: ++ andi I, M, 2 ++ beq ZERO,I, .L_M1 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x03 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x05 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 2 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 8 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx8_START 2, 0x08 ++ ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_M2_L7 */ ++ beq ZERO,TL, .L_M2_L7 ++.align 5 ++.L_M2_TL1: ++ KERNEL8xMx8 2, 0x08 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_M2_TL1 ++.L_M2_L7: ++ /* if (!(L & 7)) goto L_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_M2_L0 ++.align 5 ++.L_M2_L71: ++ KERNEL1xMx8 2, 0x08 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_M2_L71 ++.L_M2_L0: ++ SAVEMx8 2, 0x08 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ PTR_ADDI L, L, -2 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, L, -8 ++#endif ++ PTR_SLLI T0, L, 0x03 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x05 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ /* number of values in A */ ++ PTR_ADDI OFF, OFF, 0x02 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_M1: ++ andi I, M, 1 ++ beq ZERO,I, .L_M0 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x02 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x05 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 1 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 8 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx8_START 1, 0x04 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) 
goto L_M1_L7 */ ++ beq ZERO,TL, .L_M1_L7 ++.align 5 ++.L_M1_TL1: ++ KERNEL8xMx8 1, 0x04 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_M1_TL1 ++.L_M1_L7: ++ /* if (!(L & 7)) goto L_M1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_M1_L0 ++.align 5 ++.L_M1_L71: ++ KERNEL1xMx8 1, 0x04 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_M1_L71 ++.L_M1_L0: ++ SAVEMx8 1, 0x04 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ /* number of values in A */ ++ PTR_ADDI L, L, -1 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, L, -8 ++#endif ++ PTR_SLLI T0, L, 0x02 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x05 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ /* number of values in A */ ++ PTR_ADDI OFF, OFF, 0x01 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++.L_M0: ++ /* Add stride for B and C ++ * B += (K * 32) ++ * C += (LDC * 32) ++ */ ++ PTR_SLLI T0, K, 5 ++ PTR_SLLI T1, LDC, 5 ++ PTR_ADD B, B, T0 ++ PTR_ADD C, C, T1 ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ PTR_ADDI OFF, OFF, 0x08 /* number of values in B */ ++#endif ++ blt ZERO, J, .L_N8 ++ ++.L_N7: ++ andi J, N, 4 ++ beq ZERO, J, .L_N3 ++.L_N4: ++ move C0, C ++ move A0, A ++ PTR_SLLI T0, LDC, 2 ++#if __loongarch_grlen == 64 ++ GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0 ++#elif __loongarch_grlen == 32 ++ GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0 ++#else ++ GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0 ++#endif ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move OFF, OFFSET ++#endif ++ ++ /* if (!(M >> 4)) goto L_N4_M8 */ ++ PTR_SRAI I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_N4_M8 ++.align 5 ++.L_N4_M16: ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x06 ++ PTR_ADD A0, A0, T0 /* A0 += 16 * OFF */ ++ PTR_SLLI T0, OFF, 0x04 ++ PTR_ADD B0, B, T0 /* B0 += 4 * OFF */ ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 16 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1x16x4_START ++ ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N4_L7 */ ++ beq ZERO,TL, .L_N4_M16_L7 ++.align 5 ++.L_N4_M16_TL1: /* TL-- */ ++ KERNEL8x16x4 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N4_M16_TL1 ++.L_N4_M16_L7: ++ /* if (!(L & 7)) goto L_N4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N4_M16_L0 ++.align 5 ++.L_N4_M16_L71: ++ KERNEL1x16x4 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N4_M16_L71 ++.L_N4_M16_L0: ++ SAVE16x4 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -16 ++#else ++ PTR_ADDI L, L, -4 ++#endif ++ PTR_SLLI T0, L, 0x06 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x04 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x10 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++ PTR_ADDI I, I, -1 /* I-- */ ++ blt ZERO,I, .L_N4_M16 ++.L_N4_M8: ++ /* We have done M & 16, considering M=8/4/2/1 */ ++ andi I, M, 15 ++ beq ZERO,I, .L_N4_M0 ++ ++ andi I, M, 8 ++ beq ZERO,I, .L_N4_M4 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI 
T0, OFF, 0x05 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x04 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 8 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx4_START 8, 0x20 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N4_M8_L7 */ ++ beq ZERO,TL, .L_N4_M8_L7 ++.align 5 ++.L_N4_M8_TL1: /* TL-- */ ++ KERNEL8xMx4 8, 0x20 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N4_M8_TL1 ++.L_N4_M8_L7: ++ /* if (!(L & 7)) goto L_N4_M8_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N4_M8_L0 ++.align 5 ++.L_N4_M8_L71: ++ KERNEL1xMx4 8, 0x20 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N4_M8_L71 ++.L_N4_M8_L0: ++ SAVEMx4 8, 0x20 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -8 ++#else ++ PTR_ADDI L, L, -4 ++#endif ++ PTR_SLLI T0, L, 0x05 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x04 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x08 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N4_M4: ++ andi I, M, 4 ++ beq ZERO,I, .L_N4_M2 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x04 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x04 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 4 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx4_START 4, 0x10 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N4_M4_L7 */ ++ beq ZERO,TL, .L_N4_M4_L7 ++.align 5 ++.L_N4_M4_TL1: /* TL-- */ ++ KERNEL8xMx4 4, 0x10 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N4_M4_TL1 ++.L_N4_M4_L7: ++ /* if (!(L & 7)) goto L_N4_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N4_M4_L0 ++.align 5 ++.L_N4_M4_L71: ++ KERNEL1xMx4 4, 0x10 ++ ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N4_M4_L71 ++.L_N4_M4_L0: ++ SAVEMx4 4, 0x10 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -4 ++#else ++ PTR_ADDI L, L, -4 ++#endif ++ PTR_SLLI T0, L, 0x04 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x04 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x04 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N4_M2: ++ andi I, M, 2 ++ beq ZERO,I, .L_N4_M1 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x03 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x04 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 2 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx4_START 
2, 0x08 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N4_M2_L7 */ ++ beq ZERO,TL, .L_N4_M2_L7 ++.align 5 ++.L_N4_M2_TL1: /* TL-- */ ++ KERNEL8xMx4 2, 0x08 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N4_M2_TL1 ++.L_N4_M2_L7: ++ /* if (!(L & 7)) goto L_N4_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N4_M2_L0 ++.align 5 ++.L_N4_M2_L71: ++ KERNEL1xMx4 2, 0x08 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N4_M2_L71 ++.L_N4_M2_L0: ++ SAVEMx4 2, 0x08 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -2 ++#else ++ PTR_ADDI L, L, -4 ++#endif ++ PTR_SLLI T0, L, 0x03 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x04 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x02 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N4_M1: ++ andi I, M, 1 ++ beq ZERO,I, .L_N4_M0 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x02 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x04 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 1 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 4 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx4_START 1, 0x04 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N4_M1_L7 */ ++ beq ZERO,TL, .L_N4_M1_L7 ++.align 5 ++.L_N4_M1_TL1: /* TL-- */ ++ KERNEL8xMx4 1, 0x04 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N4_M1_TL1 ++.L_N4_M1_L7: ++ /* if (!(L & 7)) goto L_N4_M1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N4_M1_L0 ++.align 5 ++.L_N4_M1_L71: ++ KERNEL1xMx4 1, 0x04 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N4_M1_L71 ++.L_N4_M1_L0: ++ SAVEMx4 1, 0x04 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -1 ++#else ++ PTR_ADDI L, L, -4 ++#endif ++ PTR_SLLI T0, L, 0x02 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x04 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x01 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N4_M0: ++ /* Add stride for B and C ++ * B += 4 * K ++ * C += 4 * LDC ++ */ ++ PTR_SLLI T0, K, 4 ++ PTR_SLLI T1, LDC, 4 ++ PTR_ADD B, B, T0 ++ PTR_ADD C, C, T1 ++ ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ PTR_ADDI OFF, OFF, 0x04 ++#endif ++ /* We must reinit I */ ++ PTR_SRAI I, M, 4 /* I = bm >> 4 */ ++.L_N3: ++ andi J, N, 2 ++ beq ZERO, J, .L_N1 ++ ++.L_N2: ++ move C0, C ++ move A0, A ++ PTR_SLLI T0, LDC, 2 ++ PTR_ADD C1, C0, T0 ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move OFF, OFFSET ++#endif ++ ++ /* if (!(M >> 4)) goto L_N2_M8 */ ++ PTR_SRAI I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_N2_M8 ++.align 5 ++.L_N2_M16: ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x06 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x03 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 16 ++#else ++ /* number of values 
in B */ ++ PTR_ADDI L, OFF, 2 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1x16x2_START ++ ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N2_M16_L7 */ ++ beq ZERO,TL, .L_N2_M16_L7 ++.align 5 ++.L_N2_M16_TL1: /* TL-- */ ++ KERNEL8x16x2 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N2_M16_TL1 ++.L_N2_M16_L7: ++ /* if (!(L & 7)) goto L_N2_M16_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N2_M16_L0 ++.align 5 ++.L_N2_M16_L71: ++ KERNEL1x16x2 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N2_M16_L71 ++.L_N2_M16_L0: ++ SAVE16x2 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -16 ++#else ++ PTR_ADDI L, L, -2 ++#endif ++ PTR_SLLI T0, L, 0x06 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x03 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x10 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++ PTR_ADDI I, I, -1 /* I-- */ ++ blt ZERO,I, .L_N2_M16 ++.L_N2_M8: ++ /* We have done M & 16, considering M=8/4/2/1 */ ++ andi I, M, 15 ++ beq ZERO,I, .L_N2_M0 ++ ++ andi I, M, 8 ++ beq ZERO,I, .L_N2_M4 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x05 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x03 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 8 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 2 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx2_START 8, 0x20 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N2_M8_L7 */ ++ beq ZERO,TL, .L_N2_M8_L7 ++.align 5 ++.L_N2_M8_TL1: /* TL-- */ ++ KERNEL8xMx2 8, 0x20 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N2_M8_TL1 ++.L_N2_M8_L7: ++ /* if (!(L & 7)) goto L_N2_M8_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N2_M8_L0 ++.align 5 ++.L_N2_M8_L71: ++ KERNEL1xMx2 8, 0x20 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N2_M8_L71 ++.L_N2_M8_L0: ++ SAVEMx2 8, 0x20 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -8 ++#else ++ PTR_ADDI L, L, -2 ++#endif ++ PTR_SLLI T0, L, 0x05 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x03 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x08 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N2_M4: ++ andi I, M, 4 ++ beq ZERO,I, .L_N2_M2 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x04 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x03 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 4 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 2 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx2_START 4, 0x10 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N2_M4_L7 */ ++ beq ZERO,TL, .L_N2_M4_L7 ++.align 5 
++.L_N2_M4_TL1: /* TL-- */ ++ KERNEL8xMx2 4, 0x10 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N2_M4_TL1 ++.L_N2_M4_L7: ++ /* if (!(L & 7)) goto L_N2_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N2_M4_L0 ++.align 5 ++.L_N2_M4_L71: ++ KERNEL1xMx2 4, 0x10 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N2_M4_L71 ++.L_N2_M4_L0: ++ SAVEMx2 4, 0x10 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -4 ++#else ++ PTR_ADDI L, L, -2 ++#endif ++ PTR_SLLI T0, L, 0x04 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x03 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x04 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N2_M2: ++ andi I, M, 2 ++ beq ZERO,I, .L_N2_M1 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x03 ++ PTR_ADD A0, A0, T0 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 2 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 2 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx2_START 2, 0x08 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N2_M2_L7 */ ++ beq ZERO,TL, .L_N2_M2_L7 ++.align 5 ++.L_N2_M2_TL1: /* TL-- */ ++ KERNEL8xMx2 2, 0x08 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N2_M2_TL1 ++.L_N2_M2_L7: ++ /* if (!(L & 7)) goto L_N2_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N2_M2_L0 ++.align 5 ++.L_N2_M2_L71: ++ KERNEL1xMx2 2, 0x08 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N2_M2_L71 ++.L_N2_M2_L0: ++ SAVEMx2 2, 0x08 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -2 ++#else ++ PTR_ADDI L, L, -2 ++#endif ++ PTR_SLLI T0, L, 0x03 ++ PTR_ADD A0, A0, T0 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x02 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N2_M1: ++ andi I, M, 1 ++ beq ZERO,I, .L_N2_M0 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x02 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x03 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 1 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 2 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx2_START 1, 0x04 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N2_M1_L7 */ ++ beq ZERO,TL, .L_N2_M1_L7 ++.align 5 ++.L_N2_M1_TL1: /* TL-- */ ++ KERNEL8xMx2 1, 0x04 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N2_M1_TL1 ++.L_N2_M1_L7: ++ /* if (!(L & 7)) goto L_N2_M1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N2_M1_L0 ++.align 5 ++.L_N2_M1_L71: ++ KERNEL1xMx2 1, 0x04 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N2_M1_L71 ++.L_N2_M1_L0: ++ SAVEMx2 1, 0x04 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, 
L, -1 ++#else ++ PTR_ADDI L, L, -2 ++#endif ++ PTR_SLLI T0, L, 0x02 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x03 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x01 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N2_M0: ++ /* Add stride for B and C ++ * B += 2 * K ++ * C += 2 * LDC ++ */ ++ PTR_SLLI T0, K, 3 ++ PTR_SLLI T1, LDC, 3 ++ PTR_ADD B, B, T0 ++ PTR_ADD C, C, T1 ++#if defined(TRMMKERNEL) && !defined(LEFT) ++ PTR_ADDI OFF, OFF, 0x02 ++#endif ++ /* We must reinit I */ ++ PTR_SRAI I, M, 4 /* I = bm >> 4 */ ++.L_N1: ++ andi J, N, 1 ++ beq ZERO, J, .L_N0 ++ move C0, C ++ move A0, A ++ ++#if defined(TRMMKERNEL) && defined(LEFT) ++ move OFF, OFFSET ++#endif ++ /* if (!(M >> 4)) goto L_N1_M8 */ ++ PTR_SRAI I, M, 4 /* I = bm >> 4 */ ++ beq ZERO, I, .L_N1_M8 ++.L_N1_M16: ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x06 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x02 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 16 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 1 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1x16x1_START ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_M16_L7 */ ++ beq ZERO,TL, .L_N1_M16_L7 ++.align 5 ++.L_N1_M16_TL1: /* TL-- */ ++ KERNEL8x16x1 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N1_M16_TL1 ++.L_N1_M16_L7: ++ /* if (!(L & 7)) goto L_N1_M16_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N1_M16_L0 ++.align 5 ++.L_N1_M16_L71: ++ KERNEL1x16x1 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N1_M16_L71 ++.L_N1_M16_L0: ++ SAVE16x1 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -16 ++#else ++ PTR_ADDI L, L, -1 ++#endif ++ PTR_SLLI T0, L, 0x06 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x02 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x10 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++ PTR_ADDI I, I, -1 /* I-- */ ++ blt ZERO,I, .L_N1_M16 ++.L_N1_M8: ++ /* We have done M & 16, considering M=8/4/2/1 */ ++ andi I, M, 15 ++ beq ZERO,I, .L_N1_M0 ++ ++ andi I, M, 8 ++ beq ZERO,I, .L_N1_M4 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x05 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x02 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 8 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 1 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx1_START 8, 0x20 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_M8_L7 */ ++ beq ZERO,TL, .L_N1_M8_L7 ++.align 5 ++.L_N1_M8_TL1: /* TL-- */ ++ KERNEL8xMx1 8, 0x20 ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N1_M8_TL1 ++.L_N1_M8_L7: ++ /* if (!(L & 7)) goto L_N1_M8_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N1_M8_L0 ++.align 5 ++.L_N1_M8_L71: ++ KERNEL1xMx1 8, 0x20 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, 
.L_N1_M8_L71 ++.L_N1_M8_L0: ++ SAVEMx1 8, 0x20 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -8 ++#else ++ PTR_ADDI L, L, -1 ++#endif ++ PTR_SLLI T0, L, 0x05 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x02 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x08 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N1_M4: ++ andi I, M, 4 ++ beq ZERO,I, .L_N1_M2 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x04 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x02 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 4 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 1 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx1_START 4, 0x10 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_M4_L7 */ ++ beq ZERO,TL, .L_N1_M4_L7 ++.align 5 ++.L_N1_M4_TL1: /* TL-- */ ++ KERNEL8xMx1 4, 0x10 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N1_M4_TL1 ++.L_N1_M4_L7: ++ /* if (!(L & 7)) goto L_N1_M4_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N1_M4_L0 ++.align 5 ++.L_N1_M4_L71: ++ KERNEL1xMx1 4, 0x10 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N1_M4_L71 ++.L_N1_M4_L0: ++ SAVEMx1 4, 0x10 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -4 ++#else ++ PTR_ADDI L, L, -1 ++#endif ++ PTR_SLLI T0, L, 0x04 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x02 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x04 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N1_M2: ++ andi I, M, 2 ++ beq ZERO,I, .L_N1_M1 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x03 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, OFF, 0x02 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 2 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 1 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx1_START 2, 0x08 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_M2_L7 */ ++ beq ZERO,TL, .L_N1_M2_L7 ++.align 5 ++.L_N1_M2_TL1: /* TL-- */ ++ KERNEL8xMx1 2, 0x08 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N1_M2_TL1 ++.L_N1_M2_L7: ++ /* if (!(L & 7)) goto L_N1_M2_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N1_M2_L0 ++.align 5 ++.L_N1_M2_L71: ++ KERNEL1xMx1 2, 0x08 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N1_M2_L71 ++.L_N1_M2_L0: ++ SAVEMx1 2, 0x08 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -2 ++#else ++ PTR_ADDI L, L, -1 ++#endif ++ PTR_SLLI T0, L, 0x03 ++ PTR_ADD A0, A0, T0 ++ PTR_SLLI T0, L, 0x02 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x02 ++#endif ++#endif // #if defined(TRMMKERNEL) ++ ++.L_N1_M1: 
++ andi I, M, 1 ++ beq ZERO,I, .L_N1_M0 ++ ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ move B0, B ++#else ++ PTR_SLLI T0, OFF, 0x02 ++ PTR_ADD A0, A0, T0 ++ PTR_ADD B0, B, T0 ++#endif ++ ++#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#elif defined(LEFT) ++ /* number of values in A */ ++ PTR_ADDI L, OFF, 1 ++#else ++ /* number of values in B */ ++ PTR_ADDI L, OFF, 1 ++#endif ++#else // #if !defined(TRMMKERNEL) ++ move B0, B ++ move L, K /* L = bk */ ++#endif ++ KERNEL1xMx1_START 1, 0x04 ++ /* Reduce L */ ++ PTR_ADDI L, L, -1 ++ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ ++ /* if (TL < 1) goto L_N1_M1_L7 */ ++ beq ZERO,TL, .L_N1_M1_L7 ++.align 5 ++.L_N1_M1_TL1: /* TL-- */ ++ KERNEL8xMx1 1, 0x04 ++ ++ PTR_ADDI TL, TL, -1 /* TL-- */ ++ blt ZERO,TL, .L_N1_M1_TL1 ++.L_N1_M1_L7: ++ /* if (!(L & 7)) goto L_N1_M1_L0 */ ++ andi TL, L, 7 ++ beq TL, ZERO,.L_N1_M1_L0 ++.align 5 ++.L_N1_M1_L71: ++ KERNEL1xMx1 1, 0x04 ++ PTR_ADDI TL, TL, -1 ++ blt ZERO,TL, .L_N1_M1_L71 ++.L_N1_M1_L0: ++ SAVEMx1 1, 0x04 ++#if defined(TRMMKERNEL) ++#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) ++ PTR_SUB L, K, OFF ++#ifdef LEFT ++ PTR_ADDI L, L, -1 ++#else ++ PTR_ADDI L, L, -1 ++#endif ++ PTR_SLLI T0, L, 0x02 ++ PTR_ADD A0, A0, T0 ++ PTR_ADD B0, B0, T0 ++#endif ++ ++#ifdef LEFT ++ PTR_ADDI OFF, OFF, 0x01 ++#endif ++#endif // #if defined(TRMMKERNEL) ++.L_N1_M0: ++.L_N0: ++ pop_if_used 26, 32 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/sgemm_ncopy_16_lasx.S b/kernel/loongarch64/sgemm_ncopy_16_lasx.S +new file mode 100644 +index 000000000..266c07c5c +--- /dev/null ++++ b/kernel/loongarch64/sgemm_ncopy_16_lasx.S +@@ -0,0 +1,463 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/08/23 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++*********************************************************************/ ++ ++/* Function parameters */ ++#define M $r4 // param 1: m ++#define N $r5 // param 2: n ++#define SRC $r6 // param 3: src ++#define LDA $r7 // param 4: lda ++#define DST $r8 // param 5: dst ++ ++#define I $r9 ++#define J $r10 ++#define S1 $r12 ++#define S2 $r13 ++#define S3 $r14 ++#define S4 $r15 ++#define S5 $r16 ++#define S6 $r17 ++#define S7 $r18 ++#define S8 $r19 ++#define S9 $r20 ++#define S10 $r23 ++#define S11 $r24 ++#define S12 $r25 ++#define S13 $r26 ++#define S14 $r27 ++#define S15 $r28 ++#define S16 $r29 ++#define TD $r30 ++#define TS $r31 ++#define TL $r7 ++#define T0 $r6 ++#undef ZERO ++#define ZERO $r0 ++ ++#define F0 $f0 ++#define F1 $f1 ++#define F2 $f2 ++#define F3 $f3 ++#define F4 $f4 ++#define F5 $f5 ++#define F6 $f6 ++#define F7 $f7 ++/* LASX vectors */ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++#define U8 $xr8 ++#define U9 $xr9 ++#define U10 $xr10 ++#define U11 $xr11 ++#define U12 $xr12 ++#define U13 $xr13 ++#define U14 $xr14 ++#define U15 $xr15 ++#define D0 $xr16 ++#define D1 $xr17 ++#define D2 $xr18 ++#define D3 $xr19 ++#define D4 $xr20 ++#define D5 $xr21 ++#define D6 $xr22 ++#define D7 $xr23 ++#define D8 $xr24 ++#define D9 $xr25 ++#define D10 $xr26 ++#define D11 $xr27 ++#define D12 $xr28 ++#define D13 $xr29 ++#define D14 $xr30 ++#define D15 $xr31 ++ ++// Loops outline ++//.L_N16 <------------------- ++//| .L_M8: | ++//| .L_M7: | Main Loop ++//| .L_M1: | ++//| .L_M0: --------------- ++//.L_N15: ++//.L_N8: ++//| .L_N8_M8: ++//| .L_N8_M7: ++//| .L_N8_M1: ++//.L_N7: ++//.L_N4: ++//| .L_N4_M4: ++//| .L_N4_M3: ++//| .L_N4_M1: ++//.L_N3: ++//.L_N2: ++//| .L_N2_M2: ++//| .L_N2_M1: ++//.L_N1: ++//| .L_N1_M1: ++//.L_N0 ++ ++ PROLOGUE ++ push_if_used 26, 32 ++ ++ move TD, DST ++ move TS, SRC ++ PTR_SLLI TL, LDA, 0x02 ++ PTR_SLLI T0, TL, 0x01 ++ PTR_SRAI J, N, 0x04 ++ beq J, ZERO, .L_N15 ++.align 5 ++.L_N16: ++ move S1, TS ++ PTR_ADD S2, TS, TL ++ PTR_SRAI I, M, 0x03 ++ PTR_ADD S3, S2, TL ++ PTR_ADDI J, J, -1 ++ PTR_ADD S4, S3, TL ++ PTR_ADD S5, S3, T0 ++ PTR_ADD S6, S4, T0 ++ PTR_ADD S7, S5, T0 ++ PTR_ADD S8, S6, T0 ++ PTR_ADD S9, S7, T0 ++ PTR_ADD S10, S8, T0 ++ PTR_ADD S11, S9, T0 ++ PTR_ADD S12, S10, T0 ++ PTR_ADD S13, S11, T0 ++ PTR_ADD S14, S12, T0 ++ PTR_ADD S15, S13, T0 ++ PTR_ADD S16, S14, T0 ++ PTR_ADD TS, S15, T0 ++ beq I, ZERO, .L_M7 ++.align 5 ++.L_M8: ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ xvld U4, S5, 0x00 ++ xvld U5, S6, 0x00 ++ xvld U6, S7, 0x00 ++ xvld U7, S8, 0x00 ++ xvld U8, S9, 0x00 ++ xvld U9, S10, 0x00 ++ xvld U10, S11, 0x00 ++ xvld U11, S12, 0x00 ++ xvld U12, S13, 0x00 ++ xvld U13, S14, 0x00 ++ xvld U14, S15, 0x00 ++ xvld U15, S16, 0x00 ++ ++ GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ ++ U0, U1, U2, U3, U4, U5, U6, U7, \ ++ D1, D3, D5, D7 // As tmp ++ GTRANSPOSE8x8_W D1, D3, D5, D7, D9, D11, D13, D15, \ ++ U8, U9, U10, U11, U12, U13, U14, U15, \ ++ U0, U1, U2, U3 // As tmp ++ GST xv, , D0, TD, 0x00, D1, TD, 0x20, D2, TD, 0x40, D3, TD, 0x60, \ ++ D4, TD, 0x80, D5, TD, 0xA0, D6, TD, 0xC0, D7, TD, 0xE0 ++ PTR_ADDI TD, TD, 0x100 
++ GST xv, , D8, TD, 0x00, D9, TD, 0x20, D10, TD, 0x40, D11, TD, 0x60, \ ++ D12, TD, 0x80, D13, TD, 0xA0, D14, TD, 0xC0, D15, TD, 0xE0 ++ PTR_ADDI TD, TD, 0x100 ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI S2, S2, 0x20 ++ PTR_ADDI S3, S3, 0x20 ++ PTR_ADDI S4, S4, 0x20 ++ PTR_ADDI S5, S5, 0x20 ++ PTR_ADDI S6, S6, 0x20 ++ PTR_ADDI S7, S7, 0x20 ++ PTR_ADDI S8, S8, 0x20 ++ PTR_ADDI S9, S9, 0x20 ++ PTR_ADDI S10, S10, 0x20 ++ PTR_ADDI S11, S11, 0x20 ++ PTR_ADDI S12, S12, 0x20 ++ PTR_ADDI S13, S13, 0x20 ++ PTR_ADDI S14, S14, 0x20 ++ PTR_ADDI S15, S15, 0x20 ++ PTR_ADDI S16, S16, 0x20 ++ ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_M8 ++.L_M7: ++ andi I, M, 0x07 ++ beq I, ZERO, .L_M0 ++.align 5 ++.L_M1: ++ fld.s F0, S1, 0x00 ++ fld.s F1, S2, 0x00 ++ fld.s F2, S3, 0x00 ++ fld.s F3, S4, 0x00 ++ fld.s F4, S5, 0x00 ++ fld.s F5, S6, 0x00 ++ fld.s F6, S7, 0x00 ++ fld.s F7, S8, 0x00 ++ ++ fst.s F0, TD, 0x00 ++ fst.s F1, TD, 0x04 ++ fst.s F2, TD, 0x08 ++ fst.s F3, TD, 0x0C ++ fst.s F4, TD, 0x10 ++ fst.s F5, TD, 0x14 ++ fst.s F6, TD, 0x18 ++ fst.s F7, TD, 0x1C ++ ++ PTR_ADDI S1, S1, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ PTR_ADDI S3, S3, 0x04 ++ PTR_ADDI S4, S4, 0x04 ++ PTR_ADDI S5, S5, 0x04 ++ PTR_ADDI S6, S6, 0x04 ++ PTR_ADDI S7, S7, 0x04 ++ PTR_ADDI S8, S8, 0x04 ++ PTR_ADDI TD, TD, 0x20 ++ ++ fld.s F0, S9, 0x00 ++ fld.s F1, S10, 0x00 ++ fld.s F2, S11, 0x00 ++ fld.s F3, S12, 0x00 ++ fld.s F4, S13, 0x00 ++ fld.s F5, S14, 0x00 ++ fld.s F6, S15, 0x00 ++ fld.s F7, S16, 0x00 ++ ++ fst.s F0, TD, 0x00 ++ fst.s F1, TD, 0x04 ++ fst.s F2, TD, 0x08 ++ fst.s F3, TD, 0x0C ++ fst.s F4, TD, 0x10 ++ fst.s F5, TD, 0x14 ++ fst.s F6, TD, 0x18 ++ fst.s F7, TD, 0x1C ++ ++ PTR_ADDI S9, S9, 0x04 ++ PTR_ADDI S10, S10, 0x04 ++ PTR_ADDI S11, S11, 0x04 ++ PTR_ADDI S12, S12, 0x04 ++ PTR_ADDI S13, S13, 0x04 ++ PTR_ADDI S14, S14, 0x04 ++ PTR_ADDI S15, S15, 0x04 ++ PTR_ADDI S16, S16, 0x04 ++ PTR_ADDI TD, TD, 0x20 ++ ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_M1 ++.L_M0: ++ blt ZERO, J, .L_N16 ++.L_N15: ++ andi J, N, 0x0f ++ beq ZERO, J, .L_N0 ++ ++ andi J, N, 0x08 ++ beq ZERO, J, .L_N7 ++.L_N8: ++ move S1, TS ++ PTR_ADD S2, TS, TL ++ PTR_SRAI I, M, 0x03 ++ PTR_ADD S3, S2, TL ++ PTR_ADD S4, S2, T0 ++ PTR_ADD S5, S3, T0 ++ PTR_ADD S6, S4, T0 ++ PTR_ADD S7, S5, T0 ++ PTR_ADD S8, S6, T0 ++ PTR_ADD TS, S7, T0 ++ beq I, ZERO, .L_N8_M7 ++.align 5 ++.L_N8_M8: ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ xvld U4, S5, 0x00 ++ xvld U5, S6, 0x00 ++ xvld U6, S7, 0x00 ++ xvld U7, S8, 0x00 ++ ++ GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ ++ U0, U1, U2, U3, U4, U5, U6, U7, \ ++ D1, D3, D5, D7 // As tmp ++ GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ ++ D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 ++ PTR_ADDI TD, TD, 0x100 ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI S2, S2, 0x20 ++ PTR_ADDI S3, S3, 0x20 ++ PTR_ADDI S4, S4, 0x20 ++ PTR_ADDI S5, S5, 0x20 ++ PTR_ADDI S6, S6, 0x20 ++ PTR_ADDI S7, S7, 0x20 ++ PTR_ADDI S8, S8, 0x20 ++ ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_N8_M8 ++.L_N8_M7: ++ andi I, M, 0x07 ++ beq I, ZERO, .L_N7 ++.align 5 ++.L_N8_M1: ++ fld.s F0, S1, 0x00 ++ fld.s F1, S2, 0x00 ++ fld.s F2, S3, 0x00 ++ fld.s F3, S4, 0x00 ++ fld.s F4, S5, 0x00 ++ fld.s F5, S6, 0x00 ++ fld.s F6, S7, 0x00 ++ fld.s F7, S8, 0x00 ++ ++ fst.s F0, TD, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ fst.s F1, TD, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ fst.s F2, TD, 0x08 ++ PTR_ADDI S3, S3, 0x04 ++ fst.s F3, TD, 0x0C ++ PTR_ADDI S4, S4, 0x04 ++ fst.s F4, TD, 0x10 ++ PTR_ADDI S5, S5, 0x04 ++ fst.s F5, TD, 0x14 ++ PTR_ADDI S6, 
S6, 0x04 ++ fst.s F6, TD, 0x18 ++ PTR_ADDI S7, S7, 0x04 ++ fst.s F7, TD, 0x1C ++ PTR_ADDI S8, S8, 0x04 ++ ++ PTR_ADDI TD, TD, 0x20 ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_N8_M1 ++.L_N7: ++ andi J, N, 0x07 ++ beq ZERO, J, .L_N0 ++ ++ andi J, N, 0x04 ++ beq ZERO, J, .L_N3 ++.L_N4: ++ move S1, TS ++ PTR_ADD S2, TS, TL ++ PTR_SRAI I, M, 0x02 ++ PTR_ADD S3, S2, TL ++ PTR_ADD S4, S2, T0 ++ PTR_ADD TS, S3, T0 ++ beq I, ZERO, .L_N4_M3 ++.align 5 ++.L_N4_M4: ++ GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 ++ GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 ++ GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 ++ GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 ++ GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 ++ GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 ++ PTR_ADDI S1, S1, 0x10 ++ PTR_ADDI S2, S2, 0x10 ++ PTR_ADDI S3, S3, 0x10 ++ PTR_ADDI S4, S4, 0x10 ++ PTR_ADDI TD, TD, 0x40 ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_N4_M4 ++.L_N4_M3: ++ andi I, M, 0x03 ++ beq I, ZERO, .L_N3 ++.align 5 ++.L_N4_M1: ++ fld.s F0, S1, 0x00 ++ fld.s F1, S2, 0x00 ++ fld.s F2, S3, 0x00 ++ fld.s F3, S4, 0x00 ++ ++ fst.s F0, TD, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ fst.s F1, TD, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ fst.s F2, TD, 0x08 ++ PTR_ADDI S3, S3, 0x04 ++ fst.s F3, TD, 0x0C ++ PTR_ADDI S4, S4, 0x04 ++ ++ PTR_ADDI TD, TD, 0x10 ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_N4_M1 ++.L_N3: ++ andi J, N, 0x03 ++ beq ZERO, J, .L_N0 ++ ++ andi J, N, 0x02 ++ beq ZERO, J, .L_N1 ++.L_N2: ++ move S1, TS ++ PTR_ADD S2, TS, TL ++ PTR_SRAI I, M, 0x01 ++ PTR_ADD TS, S2, TL ++ beq I, ZERO, .L_N2_M1 ++.align 5 ++.L_N2_M2: ++ GLD f, d, F0, S1, 0x00, F1, S2, 0x00 ++ vilvl.w $vr0, $vr1, $vr0 ++ GST v, , $vr0, TD, 0x00 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI S2, S2, 0x08 ++ PTR_ADDI TD, TD, 0x10 ++ ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_N2_M2 ++.L_N2_M1: ++ andi I, M, 0x01 ++ beq I, ZERO, .L_N1 ++ ++ fld.s F0, S1, 0x00 ++ fld.s F1, S2, 0x00 ++ ++ fst.s F0, TD, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ fst.s F1, TD, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ PTR_ADDI TD, TD, 0x08 ++.align 5 ++.L_N1: ++ move S1, TS ++ beq ZERO, M, .L_N0 ++.L_N1_M1: ++ fld.s F0, S1, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ fst.s F0, TD, 0x00 ++ PTR_ADDI TD, TD, 0x04 ++ PTR_ADDI M, M, -1 ++ blt ZERO, M, .L_N1_M1 ++.L_N0: ++ pop_if_used 26, 32 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/sgemm_ncopy_8_lasx.S b/kernel/loongarch64/sgemm_ncopy_8_lasx.S +new file mode 100644 +index 000000000..5c173568b +--- /dev/null ++++ b/kernel/loongarch64/sgemm_ncopy_8_lasx.S +@@ -0,0 +1,298 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. 
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/08/23 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++*********************************************************************/ ++ ++/* Function parameters */ ++#define M $r4 // param 1: m ++#define N $r5 // param 2: n ++#define SRC $r6 // param 3: src ++#define LDA $r7 // param 4: lda ++#define DST $r8 // param 5: dst ++ ++#define I $r9 ++#define J $r10 ++#define S1 $r12 ++#define S2 $r13 ++#define S3 $r14 ++#define S4 $r15 ++#define S5 $r16 ++#define S6 $r17 ++#define S7 $r18 ++#define S8 $r19 ++#define TD $r20 ++#define TS $r11 ++#define TL $r7 ++#define T0 $r6 ++#undef ZERO ++#define ZERO $r0 ++ ++#define F0 $f0 ++#define F1 $f1 ++#define F2 $f2 ++#define F3 $f3 ++#define F4 $f4 ++#define F5 $f5 ++#define F6 $f6 ++#define F7 $f7 ++/* LASX vectors */ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++#define D0 $xr8 ++#define D1 $xr9 ++#define D2 $xr10 ++#define D3 $xr11 ++#define D4 $xr12 ++#define D5 $xr13 ++#define D6 $xr14 ++#define D7 $xr15 ++#define D8 $xr16 ++#define D10 $xr17 ++#define D12 $xr18 ++#define D14 $xr19 ++ ++// Loops outline ++//.L_N8: <---------------- ++//| .L_M8: | ++//| .L_M7: | Main Loop ++//| .L_M1: | ++//| .L_M0:-------------- ++//.L_N7: ++//.L_N4: ++//| .L_N4_M4: ++//| .L_N4_M3: ++//| .L_N4_M1: ++//.L_N3: ++//.L_N2: ++//| .L_N2_M2: ++//| .L_N2_M1: ++//.L_N1: ++//| .L_N1_M1: ++//.L_N0 ++ ++ PROLOGUE ++ push_if_used 17, 20 ++ ++ move TD, DST ++ move TS, SRC ++ PTR_SLLI TL, LDA, 0x02 ++ PTR_SLLI T0, TL, 0x01 ++ PTR_SRAI J, N, 0x03 ++ beq J, ZERO, .L_N7 ++.align 5 ++.L_N8: ++ move S1, TS ++ PTR_ADD S2, TS, TL ++ PTR_SRAI I, M, 0x03 ++ PTR_ADD S3, S2, TL ++ PTR_ADDI J, J, -1 ++ PTR_ADD S4, S2, T0 ++ PTR_ADD S5, S3, T0 ++ PTR_ADD S6, S4, T0 ++ PTR_ADD S7, S5, T0 ++ PTR_ADD S8, S6, T0 ++ PTR_ADD TS, S7, T0 ++ beq I, ZERO, .L_M7 ++.align 5 ++.L_M8: ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ xvld U4, S5, 0x00 ++ xvld U5, S6, 0x00 ++ xvld U6, S7, 0x00 ++ xvld U7, S8, 0x00 ++ ++ GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ ++ U0, U1, U2, U3, U4, U5, U6, U7, \ ++ D1, D3, D5, D7 // As tmp ++ GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ ++ D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 ++ PTR_ADDI TD, TD, 0x100 ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI S2, S2, 0x20 ++ PTR_ADDI S3, S3, 0x20 ++ PTR_ADDI S4, S4, 0x20 ++ PTR_ADDI S5, S5, 0x20 ++ PTR_ADDI S6, S6, 0x20 ++ PTR_ADDI S7, S7, 0x20 ++ 
PTR_ADDI S8, S8, 0x20 ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_M8 ++.L_M7: ++ andi I, M, 0x07 ++ beq I, ZERO, .L_M0 ++.align 5 ++.L_M1: ++ fld.s F0, S1, 0x00 ++ fld.s F1, S2, 0x00 ++ fld.s F2, S3, 0x00 ++ fld.s F3, S4, 0x00 ++ fld.s F4, S5, 0x00 ++ fld.s F5, S6, 0x00 ++ fld.s F6, S7, 0x00 ++ fld.s F7, S8, 0x00 ++ ++ fst.s F0, TD, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ fst.s F1, TD, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ fst.s F2, TD, 0x08 ++ PTR_ADDI S3, S3, 0x04 ++ fst.s F3, TD, 0x0C ++ PTR_ADDI S4, S4, 0x04 ++ fst.s F4, TD, 0x10 ++ PTR_ADDI S5, S5, 0x04 ++ fst.s F5, TD, 0x14 ++ PTR_ADDI S6, S6, 0x04 ++ fst.s F6, TD, 0x18 ++ PTR_ADDI S7, S7, 0x04 ++ fst.s F7, TD, 0x1C ++ PTR_ADDI S8, S8, 0x04 ++ ++ PTR_ADDI TD, TD, 0x20 ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_M1 ++.L_M0: ++ blt ZERO, J, .L_N8 ++.L_N7: ++ andi J, N, 0x07 ++ beq ZERO, J, .L_N0 ++ ++ andi J, N, 0x04 ++ beq ZERO, J, .L_N3 ++.L_N4: ++ move S1, TS ++ PTR_ADD S2, TS, TL ++ PTR_SRAI I, M, 0x02 ++ PTR_ADD S3, S2, TL ++ PTR_ADD S4, S2, T0 ++ PTR_ADD TS, S3, T0 ++ beq I, ZERO, .L_N4_M3 ++.align 5 ++.L_N4_M4: ++ GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 ++ GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 ++ GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 ++ GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 ++ GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 ++ GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 ++ PTR_ADDI S1, S1, 0x10 ++ PTR_ADDI S2, S2, 0x10 ++ PTR_ADDI S3, S3, 0x10 ++ PTR_ADDI S4, S4, 0x10 ++ PTR_ADDI TD, TD, 0x40 ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_N4_M4 ++.L_N4_M3: ++ andi I, M, 0x03 ++ beq I, ZERO, .L_N3 ++.align 5 ++.L_N4_M1: ++ fld.s F0, S1, 0x00 ++ fld.s F1, S2, 0x00 ++ fld.s F2, S3, 0x00 ++ fld.s F3, S4, 0x00 ++ ++ fst.s F0, TD, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ fst.s F1, TD, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ fst.s F2, TD, 0x08 ++ PTR_ADDI S3, S3, 0x04 ++ fst.s F3, TD, 0x0C ++ PTR_ADDI S4, S4, 0x04 ++ ++ PTR_ADDI TD, TD, 0x10 ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_N4_M1 ++.L_N3: ++ andi J, N, 0x03 ++ beq ZERO, J, .L_N0 ++ ++ andi J, N, 0x02 ++ beq ZERO, J, .L_N1 ++.L_N2: ++ move S1, TS ++ PTR_ADD S2, TS, TL ++ PTR_SRAI I, M, 0x01 ++ PTR_ADD TS, S2, TL ++ beq I, ZERO, .L_N2_M1 ++.align 5 ++.L_N2_M2: ++ GLD f, d, F0, S1, 0x00, F1, S2, 0x00 ++ vilvl.w $vr0, $vr1, $vr0 ++ GST v, , $vr0, TD, 0x00 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI S2, S2, 0x08 ++ PTR_ADDI TD, TD, 0x10 ++ ++ PTR_ADDI I, I, -1 ++ blt ZERO, I, .L_N2_M2 ++.L_N2_M1: ++ andi I, M, 0x01 ++ beq I, ZERO, .L_N1 ++ ++ fld.s F0, S1, 0x00 ++ fld.s F1, S2, 0x00 ++ ++ fst.s F0, TD, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ fst.s F1, TD, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ PTR_ADDI TD, TD, 0x08 ++.align 5 ++.L_N1: ++ move S1, TS ++ beq ZERO, M, .L_N0 ++.L_N1_M1: ++ fld.s F0, S1, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ fst.s F0, TD, 0x00 ++ PTR_ADDI TD, TD, 0x04 ++ PTR_ADDI M, M, -1 ++ blt ZERO, M, .L_N1_M1 ++.L_N0: ++ pop_if_used 17, 20 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/sgemm_tcopy_16_lasx.S b/kernel/loongarch64/sgemm_tcopy_16_lasx.S +new file mode 100644 +index 000000000..d9789bdcd +--- /dev/null ++++ b/kernel/loongarch64/sgemm_tcopy_16_lasx.S +@@ -0,0 +1,526 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. 
Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/08/23 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++*********************************************************************/ ++ ++/* Function parameters */ ++#define M $r4 // param 1: m ++#define N $r5 // param 2: n ++#define SRC $r6 // param 3: src ++#define LDA $r7 // param 4: lda ++#define DST $r8 // param 5: dst ++ ++#define I $r9 ++#define J $r10 ++#define S0 $r11 ++#define S1 $r12 ++#define S2 $r13 ++#define S3 $r14 ++#define S4 $r15 ++#define S5 $r16 ++#define S6 $r17 ++#define S7 $r18 ++#define S8 $r19 ++#define P0 $r20 ++#define P1 $r23 ++#define P2 $r24 ++#define P3 $r25 ++#define P4 $r26 ++#define P5 $r27 ++#define T0 $r28 ++#define T1 $r29 ++#define TL $r7 ++#define ZERO $r0 ++ ++/* LASX vectors */ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++ ++// Loops outline ++//.L_M8 <------------------- ++//| .L_N16: | ++//| .L_N15: | ++//| .L_N8: | ++//| .L_N7: | Main Loop ++//| .L_N4: | ++//| .L_N3: | ++//| .L_N2: | ++//| .L_N1: | ++//| .L_N0: --------------- ++//.L_M7 ++//.L_M4 ++//| .L_M4_N16: ++//| .L_M4_N15: ++//| .L_M4_N8: ++//| .L_M4_N7: ++//| .L_M4_N4: ++//| .L_M4_N3: ++//| .L_M4_N2: ++//| .L_M4_N1: ++//.L_M3 ++//.L_M2 ++//| .L_M2_N16: ++//| .L_M2_N15: ++//| .L_M2_N8: ++//| .L_M2_N7: ++//| .L_M2_N4: ++//| .L_M2_N3: ++//| .L_M2_N2: ++//| .L_M2_N1: ++//.L_M1 ++//| .L_M1_N16: ++//| .L_M1_N15: ++//| .L_M1_N8: ++//| .L_M1_N7: ++//| .L_M1_N4: ++//| .L_M1_N3: ++//| .L_M1_N2: ++//| .L_M1_N1: ++//.L_M0 ++ ++ PROLOGUE ++ push_if_used 24, 8 ++ ++ move S0, SRC ++ move P0, DST ++ ++ PTR_SRAI T0, N, 0x04 ++ PTR_SRAI T1, N, 0x03 ++ PTR_SLLI T0, T0, 0x04 ++ PTR_SLLI T1, T1, 0x03 ++ ++ PTR_MUL P2, M, T0 ++ PTR_MUL P3, M, T1 ++ PTR_SLLI P2, P2, 0x02 ++ PTR_SLLI P3, P3, 0x02 ++ PTR_ADD P2, DST, P2 ++ PTR_ADD P3, DST, P3 ++ ++ PTR_SRAI T0, N, 0x02 ++ PTR_SRAI T1, N, 0x01 ++ PTR_SLLI T0, T0, 0x02 ++ PTR_SLLI T1, T1, 0x01 ++ PTR_MUL P4, M, T0 ++ PTR_MUL P5, M, T1 ++ 
PTR_SLLI P4, P4, 0x02 ++ PTR_SLLI P5, P5, 0x02 ++ PTR_ADD P4, DST, P4 ++ PTR_ADD P5, DST, P5 ++ ++ PTR_SLLI TL, LDA, 0x02 ++ PTR_SRAI J, M, 0x03 ++ PTR_SLLI T0, TL, 0x01 ++ PTR_SLLI T1, M, 0x06 ++ beq ZERO, J, .L_M7 ++.align 5 ++.L_M8: ++ move S1, S0 ++ PTR_ADD S2, S0, TL ++ PTR_ADD S3, S1, T0 ++ PTR_ADD S4, S2, T0 ++ PTR_ADD S5, S3, T0 ++ PTR_ADD S6, S4, T0 ++ PTR_ADD S7, S5, T0 ++ PTR_ADD S8, S6, T0 ++ PTR_ADD S0, S7, T0 ++ ++ move P1, P0 ++ PTR_ADDI P0, P0, 0x200 ++ ++ PTR_SRAI I, N, 0x04 ++ PTR_ADDI J, J, -1 ++ beq ZERO, I, .L_N15 ++.L_N16: ++ xvld U0, S1, 0x00 ++ xvld U1, S1, 0x20 ++ xvld U2, S2, 0x00 ++ xvld U3, S2, 0x20 ++ ++ xvst U0, P1, 0x00 ++ xvst U1, P1, 0x20 ++ xvst U2, P1, 0x40 ++ xvst U3, P1, 0x60 ++ ++ xvld U4, S3, 0x00 ++ xvld U5, S3, 0x20 ++ xvld U6, S4, 0x00 ++ xvld U7, S4, 0x20 ++ ++ xvst U4, P1, 0x80 ++ xvst U5, P1, 0xA0 ++ xvst U6, P1, 0xC0 ++ xvst U7, P1, 0xE0 ++ ++ xvld U0, S5, 0x00 ++ xvld U1, S5, 0x20 ++ xvld U2, S6, 0x00 ++ xvld U3, S6, 0x20 ++ ++ xvst U0, P1, 0x100 ++ xvst U1, P1, 0x120 ++ xvst U2, P1, 0x140 ++ xvst U3, P1, 0x160 ++ ++ xvld U4, S7, 0x00 ++ xvld U5, S7, 0x20 ++ xvld U6, S8, 0x00 ++ xvld U7, S8, 0x20 ++ ++ xvst U4, P1, 0x180 ++ xvst U5, P1, 0x1A0 ++ xvst U6, P1, 0x1C0 ++ xvst U7, P1, 0x1E0 ++ ++ PTR_ADDI S1, S1, 0x40 ++ PTR_ADDI S2, S2, 0x40 ++ PTR_ADDI S3, S3, 0x40 ++ PTR_ADDI S4, S4, 0x40 ++ PTR_ADDI S5, S5, 0x40 ++ PTR_ADDI S6, S6, 0x40 ++ PTR_ADDI S7, S7, 0x40 ++ PTR_ADDI S8, S8, 0x40 ++ ++ PTR_ADDI I, I, -1 ++ PTR_ADD P1, P1, T1 ++ blt ZERO, I, .L_N16 ++.L_N15: ++ andi I, N, 0x08 ++ beq ZERO, I, .L_N7 ++.L_N8: ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ xvld U4, S5, 0x00 ++ xvld U5, S6, 0x00 ++ xvld U6, S7, 0x00 ++ xvld U7, S8, 0x00 ++ ++ GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60, \ ++ U4, P2, 0x80, U5, P2, 0xA0, U6, P2, 0xC0, U7, P2, 0xE0 ++ ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI S2, S2, 0x20 ++ PTR_ADDI S3, S3, 0x20 ++ PTR_ADDI S4, S4, 0x20 ++ PTR_ADDI S5, S5, 0x20 ++ PTR_ADDI S6, S6, 0x20 ++ PTR_ADDI S7, S7, 0x20 ++ PTR_ADDI S8, S8, 0x20 ++ PTR_ADDI P2, P2, 0x100 ++.L_N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_N3 ++.L_N4: ++ GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ ++ $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 ++ GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30, \ ++ $vr4, P3, 0x40, $vr5, P3, 0x50, $vr6, P3, 0x60, $vr7, P3, 0x70 ++ PTR_ADDI S1, S1, 0x10 ++ PTR_ADDI S2, S2, 0x10 ++ PTR_ADDI S3, S3, 0x10 ++ PTR_ADDI S4, S4, 0x10 ++ PTR_ADDI S5, S5, 0x10 ++ PTR_ADDI S6, S6, 0x10 ++ PTR_ADDI S7, S7, 0x10 ++ PTR_ADDI S8, S8, 0x10 ++ PTR_ADDI P3, P3, 0x80 ++.L_N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_N1 ++.L_N2: ++ GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ ++ $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 ++ GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18, \ ++ $f4, P4, 0x20, $f5, P4, 0x28, $f6, P4, 0x30, $f7, P4, 0x38 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI S2, S2, 0x08 ++ PTR_ADDI S3, S3, 0x08 ++ PTR_ADDI S4, S4, 0x08 ++ PTR_ADDI S5, S5, 0x08 ++ PTR_ADDI S6, S6, 0x08 ++ PTR_ADDI S7, S7, 0x08 ++ PTR_ADDI S8, S8, 0x08 ++ PTR_ADDI P4, P4, 0x40 ++.L_N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_N0 ++ ++ GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ ++ $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 ++ GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C, \ ++ $f4, P5, 0x10, $f5, P5, 0x14, $f6, P5, 
0x18, $f7, P5, 0x1C ++ PTR_ADDI S1, S1, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ PTR_ADDI S3, S3, 0x04 ++ PTR_ADDI S4, S4, 0x04 ++ PTR_ADDI S5, S5, 0x04 ++ PTR_ADDI S6, S6, 0x04 ++ PTR_ADDI S7, S7, 0x04 ++ PTR_ADDI S8, S8, 0x04 ++ PTR_ADDI P5, P5, 0x20 ++.L_N0: ++ blt ZERO, J, .L_M8 ++.L_M7: ++ andi J, M, 0x04 ++ beq ZERO, J, .L_M3 ++.L_M4: ++ move S1, S0 ++ PTR_ADD S2, S0, TL ++ PTR_ADD S3, S1, T0 ++ PTR_ADD S4, S2, T0 ++ PTR_ADD S0, S3, T0 ++ ++ move P1, P0 ++ PTR_ADDI P0, P0, 0x100 ++ ++ PTR_SRAI I, N, 0x04 ++ beq ZERO, I, .L_M4_N15 ++.align 5 ++.L_M4_N16: ++ xvld U0, S1, 0x00 ++ xvld U1, S1, 0x20 ++ xvld U2, S2, 0x00 ++ xvld U3, S2, 0x20 ++ ++ xvst U0, P1, 0x00 ++ xvst U1, P1, 0x20 ++ xvst U2, P1, 0x40 ++ xvst U3, P1, 0x60 ++ ++ xvld U4, S3, 0x00 ++ xvld U5, S3, 0x20 ++ xvld U6, S4, 0x00 ++ xvld U7, S4, 0x20 ++ ++ xvst U4, P1, 0x80 ++ xvst U5, P1, 0xA0 ++ xvst U6, P1, 0xC0 ++ xvst U7, P1, 0xE0 ++ ++ PTR_ADDI S1, S1, 0x40 ++ PTR_ADDI S2, S2, 0x40 ++ PTR_ADDI S3, S3, 0x40 ++ PTR_ADDI S4, S4, 0x40 ++ PTR_ADDI I, I, -1 ++ PTR_ADD P1, P1, T1 ++ blt ZERO, I, .L_M4_N16 ++.L_M4_N15: ++ andi I, N, 0x08 ++ beq ZERO, I, .L_M4_N7 ++.L_M4_N8: ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ ++ GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60 ++ ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI S2, S2, 0x20 ++ PTR_ADDI S3, S3, 0x20 ++ PTR_ADDI S4, S4, 0x20 ++ PTR_ADDI P2, P2, 0x80 ++.L_M4_N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_M4_N3 ++.L_M4_N4: ++ GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 ++ GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30 ++ PTR_ADDI S1, S1, 0x10 ++ PTR_ADDI S2, S2, 0x10 ++ PTR_ADDI S3, S3, 0x10 ++ PTR_ADDI S4, S4, 0x10 ++ PTR_ADDI P3, P3, 0x40 ++.L_M4_N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_M4_N1 ++.L_M4_N2: ++ GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 ++ GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI S2, S2, 0x08 ++ PTR_ADDI S3, S3, 0x08 ++ PTR_ADDI S4, S4, 0x08 ++ PTR_ADDI P4, P4, 0x20 ++.L_M4_N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_M3 ++ ++ GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 ++ GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C ++ PTR_ADDI S1, S1, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ PTR_ADDI S3, S3, 0x04 ++ PTR_ADDI S4, S4, 0x04 ++ PTR_ADDI P5, P5, 0x10 ++.L_M3: ++ andi J, M, 0x02 ++ beq ZERO, J, .L_M1 ++.L_M2: ++ move S1, S0 ++ PTR_ADD S2, S0, TL ++ PTR_ADD S0, S0, T0 ++ ++ move P1, P0 ++ PTR_ADDI P0, P0, 0x80 ++ ++ PTR_SRAI I, N, 0x04 ++ beq ZERO, I, .L_M2_N15 ++.align 5 ++.L_M2_N16: ++ xvld U0, S1, 0x00 ++ xvld U1, S1, 0x20 ++ xvld U2, S2, 0x00 ++ xvld U3, S2, 0x20 ++ ++ xvst U0, P1, 0x00 ++ xvst U1, P1, 0x20 ++ xvst U2, P1, 0x40 ++ xvst U3, P1, 0x60 ++ ++ PTR_ADDI S1, S1, 0x40 ++ PTR_ADDI S2, S2, 0x40 ++ PTR_ADDI I, I, -1 ++ PTR_ADD P1, P1, T1 ++ blt ZERO, I, .L_M2_N16 ++.L_M2_N15: ++ andi I, N, 0x08 ++ beq ZERO, I, .L_M2_N7 ++.L_M2_N8: ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ ++ GST xv, , U0, P2, 0x00, U1, P2, 0x20 ++ ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI S2, S2, 0x20 ++ PTR_ADDI P2, P2, 0x40 ++.L_M2_N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_M2_N3 ++.L_M2_N4: ++ GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 ++ GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10 ++ PTR_ADDI S1, S1, 0x10 ++ PTR_ADDI S2, S2, 0x10 ++ PTR_ADDI P3, P3, 0x20 ++.L_M2_N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_M2_N1 ++.L_M2_N2: ++ GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 ++ GST f, d, $f0, P4, 
0x00, $f1, P4, 0x08 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI S2, S2, 0x08 ++ PTR_ADDI P4, P4, 0x10 ++.L_M2_N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_M1 ++ ++ GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 ++ GST f, s, $f0, P5, 0x00, $f1, P5, 0x04 ++ PTR_ADDI S1, S1, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ PTR_ADDI P5, P5, 0x08 ++.L_M1: ++ andi J, M, 0x01 ++ beq ZERO, J, .L_M0 ++ ++ move S1, S0 ++ PTR_ADD S2, S0, TL ++ ++ move P1, P0 ++ PTR_ADDI P0, P0, 0x40 ++ ++ PTR_SRAI I, N, 0x04 ++ beq ZERO, I, .L_M1_N15 ++.align 5 ++.L_M1_N16: ++ xvld U0, S1, 0x00 ++ xvld U1, S1, 0x20 ++ ++ xvst U0, P1, 0x00 ++ xvst U1, P1, 0x20 ++ ++ PTR_ADDI S1, S1, 0x40 ++ PTR_ADDI I, I, -1 ++ PTR_ADD P1, P1, T1 ++ blt ZERO, I, .L_M1_N16 ++.L_M1_N15: ++ andi I, N, 0x08 ++ beq ZERO, I, .L_M1_N7 ++.L_M1_N8: ++ xvld U0, S1, 0x00 ++ ++ GST xv, , U0, P2, 0x00 ++ ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI P2, P2, 0x20 ++.L_M1_N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_M1_N3 ++.L_M1_N4: ++ GLD v, , $vr0, S1, 0x00 ++ GST v, , $vr0, P3, 0x00 ++ PTR_ADDI S1, S1, 0x10 ++ PTR_ADDI P3, P3, 0x10 ++.L_M1_N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_M1_N1 ++.L_M1_N2: ++ GLD f, d, $f0, S1, 0x00 ++ GST f, d, $f0, P4, 0x00 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI P4, P4, 0x08 ++.L_M1_N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_M0 ++ ++ GLD f, s, $f0, S1, 0x00 ++ GST f, s, $f0, P5, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ PTR_ADDI P5, P5, 0x04 ++.L_M0: ++ pop_if_used 24, 8 ++ jirl $r0, $r1, 0x00 ++ EPILOGUE +diff --git a/kernel/loongarch64/sgemm_tcopy_8_lasx.S b/kernel/loongarch64/sgemm_tcopy_8_lasx.S +new file mode 100644 +index 000000000..725a47a60 +--- /dev/null ++++ b/kernel/loongarch64/sgemm_tcopy_8_lasx.S +@@ -0,0 +1,406 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/08/23 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++*********************************************************************/ ++ ++/* Function parameters */ ++#define M $r4 // param 1: m ++#define N $r5 // param 2: n ++#define SRC $r6 // param 3: src ++#define LDA $r7 // param 4: lda ++#define DST $r8 // param 5: dst ++ ++#define I $r9 ++#define J $r10 ++#define S0 $r11 ++#define S1 $r12 ++#define S2 $r13 ++#define S3 $r14 ++#define S4 $r15 ++#define S5 $r16 ++#define S6 $r17 ++#define S7 $r18 ++#define S8 $r19 ++#define P0 $r20 ++#define P1 $r23 ++#define P2 $r24 ++#define P3 $r25 ++#define P4 $r26 ++#define T0 $r27 ++#define T1 $r28 ++#define TL $r7 ++#undef ZERO ++#define ZERO $r0 ++ ++/* LASX vectors */ ++#define U0 $xr0 ++#define U1 $xr1 ++#define U2 $xr2 ++#define U3 $xr3 ++#define U4 $xr4 ++#define U5 $xr5 ++#define U6 $xr6 ++#define U7 $xr7 ++ ++// Loops outline ++//.L_M8 <------------------- ++//| .L_N8: | ++//| .L_N7: | Main Loop ++//| .L_N4: | ++//| .L_N3: | ++//| .L_N2: | ++//| .L_N1: | ++//| .L_N0: --------------- ++//.L_M7 ++//.L_M4 ++//| .L_M4_N8: ++//| .L_M4_N7: ++//| .L_M4_N4: ++//| .L_M4_N3: ++//| .L_M4_N2: ++//| .L_M4_N1: ++//.L_M3 ++//.L_M2 ++//| .L_M2_N8: ++//| .L_M2_N7: ++//| .L_M2_N4: ++//| .L_M2_N3: ++//| .L_M2_N2: ++//| .L_M2_N1: ++//.L_M1 ++//| .L_M1_N8: ++//| .L_M1_N7: ++//| .L_M1_N4: ++//| .L_M1_N3: ++//| .L_M1_N2: ++//| .L_M1_N1: ++//.L_M0 ++ ++ PROLOGUE ++ push_if_used 23, 8 ++ ++ move S0, SRC ++ move P0, DST ++ ++ PTR_SRAI T0, N, 0x04 ++ PTR_SRAI T1, N, 0x03 ++ PTR_SLLI T0, T0, 0x04 ++ PTR_SLLI T1, T1, 0x03 ++ ++ PTR_MUL P2, M, T1 ++ PTR_SLLI P2, P2, 0x02 ++ PTR_ADD P2, DST, P2 ++ PTR_SRAI T0, N, 0x02 ++ PTR_SRAI T1, N, 0x01 ++ PTR_SLLI T0, T0, 0x02 ++ PTR_SLLI T1, T1, 0x01 ++ PTR_MUL P3, M, T0 ++ PTR_MUL P4, M, T1 ++ PTR_SLLI P3, P3, 0x02 ++ PTR_SLLI P4, P4, 0x02 ++ PTR_ADD P3, DST, P3 ++ PTR_ADD P4, DST, P4 ++ ++ PTR_SLLI TL, LDA, 0x02 ++ PTR_SRAI J, M, 0x03 ++ PTR_SLLI T0, TL, 0x01 ++ PTR_SLLI T1, M, 0x05 ++ beq ZERO, J, .L_M7 ++.align 5 ++.L_M8: ++ move S1, S0 ++ PTR_ADD S2, S0, TL ++ PTR_ADD S3, S1, T0 ++ PTR_ADD S4, S2, T0 ++ PTR_ADD S5, S3, T0 ++ PTR_ADD S6, S4, T0 ++ PTR_ADD S7, S5, T0 ++ PTR_ADD S8, S6, T0 ++ PTR_ADD S0, S7, T0 ++ ++ move P1, P0 ++ PTR_ADDI P0, P0, 0x100 ++ ++ PTR_SRAI I, N, 0x03 ++ PTR_ADDI J, J, -1 ++ beq ZERO, I, .L_N7 ++.L_N8: ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ xvld U4, S5, 0x00 ++ xvld U5, S6, 0x00 ++ xvld U6, S7, 0x00 ++ xvld U7, S8, 0x00 ++ ++ GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60, \ ++ U4, P1, 0x80, U5, P1, 0xA0, U6, P1, 0xC0, U7, P1, 0xE0 ++ ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI S2, S2, 0x20 ++ PTR_ADDI S3, S3, 0x20 ++ PTR_ADDI S4, S4, 0x20 ++ PTR_ADDI S5, S5, 0x20 ++ PTR_ADDI S6, S6, 0x20 ++ PTR_ADDI S7, S7, 0x20 ++ PTR_ADDI S8, S8, 0x20 ++ ++ PTR_ADDI I, I, -1 ++ PTR_ADD P1, P1, T1 ++ blt ZERO, I, .L_N8 ++.L_N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_N3 ++.L_N4: ++ GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ ++ $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 ++ GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30, \ ++ $vr4, P2, 0x40, $vr5, P2, 0x50, $vr6, P2, 0x60, $vr7, P2, 0x70 ++ PTR_ADDI S1, S1, 0x10 ++ PTR_ADDI S2, S2, 0x10 ++ 
PTR_ADDI S3, S3, 0x10 ++ PTR_ADDI S4, S4, 0x10 ++ PTR_ADDI S5, S5, 0x10 ++ PTR_ADDI S6, S6, 0x10 ++ PTR_ADDI S7, S7, 0x10 ++ PTR_ADDI S8, S8, 0x10 ++ PTR_ADDI P2, P2, 0x80 ++.L_N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_N1 ++.L_N2: ++ GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ ++ $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 ++ GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18, \ ++ $f4, P3, 0x20, $f5, P3, 0x28, $f6, P3, 0x30, $f7, P3, 0x38 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI S2, S2, 0x08 ++ PTR_ADDI S3, S3, 0x08 ++ PTR_ADDI S4, S4, 0x08 ++ PTR_ADDI S5, S5, 0x08 ++ PTR_ADDI S6, S6, 0x08 ++ PTR_ADDI S7, S7, 0x08 ++ PTR_ADDI S8, S8, 0x08 ++ PTR_ADDI P3, P3, 0x40 ++.L_N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_N0 ++ ++ GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ ++ $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 ++ GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C, \ ++ $f4, P4, 0x10, $f5, P4, 0x14, $f6, P4, 0x18, $f7, P4, 0x1C ++ PTR_ADDI S1, S1, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ PTR_ADDI S3, S3, 0x04 ++ PTR_ADDI S4, S4, 0x04 ++ PTR_ADDI S5, S5, 0x04 ++ PTR_ADDI S6, S6, 0x04 ++ PTR_ADDI S7, S7, 0x04 ++ PTR_ADDI S8, S8, 0x04 ++ PTR_ADDI P4, P4, 0x20 ++.L_N0: ++ blt ZERO, J, .L_M8 ++ ++.L_M7: ++ andi J, M, 0x04 ++ beq ZERO, J, .L_M3 ++.L_M4: ++ move S1, S0 ++ PTR_ADD S2, S0, TL ++ PTR_ADD S3, S1, T0 ++ PTR_ADD S4, S2, T0 ++ PTR_ADD S0, S3, T0 ++ ++ move P1, P0 ++ PTR_ADDI P0, P0, 0x80 ++ ++ PTR_SRAI I, N, 0x03 ++ beq ZERO, I, .L_M4_N7 ++.align 5 ++.L_M4_N8: ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ xvld U2, S3, 0x00 ++ xvld U3, S4, 0x00 ++ ++ GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60 ++ ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI S2, S2, 0x20 ++ PTR_ADDI S3, S3, 0x20 ++ PTR_ADDI S4, S4, 0x20 ++ ++ PTR_ADDI I, I, -1 ++ PTR_ADD P1, P1, T1 ++ blt ZERO, I, .L_M4_N8 ++.L_M4_N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_M4_N3 ++.L_M4_N4: ++ GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 ++ GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30 ++ PTR_ADDI S1, S1, 0x10 ++ PTR_ADDI S2, S2, 0x10 ++ PTR_ADDI S3, S3, 0x10 ++ PTR_ADDI S4, S4, 0x10 ++ PTR_ADDI P2, P2, 0x40 ++.L_M4_N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_M4_N1 ++.L_M4_N2: ++ GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 ++ GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI S2, S2, 0x08 ++ PTR_ADDI S3, S3, 0x08 ++ PTR_ADDI S4, S4, 0x08 ++ PTR_ADDI P3, P3, 0x20 ++.L_M4_N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_M3 ++ ++ GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 ++ GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C ++ PTR_ADDI S1, S1, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ PTR_ADDI S3, S3, 0x04 ++ PTR_ADDI S4, S4, 0x04 ++ PTR_ADDI P4, P4, 0x10 ++.L_M3: ++ andi J, M, 0x02 ++ beq ZERO, J, .L_M1 ++.L_M2: ++ move S1, S0 ++ PTR_ADD S2, S0, TL ++ PTR_ADD S0, S0, T0 ++ ++ move P1, P0 ++ PTR_ADDI P0, P0, 0x40 ++ ++ PTR_SRAI I, N, 0x03 ++ beq ZERO, I, .L_M2_N7 ++.align 5 ++.L_M2_N8: ++ xvld U0, S1, 0x00 ++ xvld U1, S2, 0x00 ++ ++ GST xv, , U0, P1, 0x00, U1, P1, 0x20 ++ ++ PTR_ADDI S1, S1, 0x20 ++ PTR_ADDI S2, S2, 0x20 ++ PTR_ADDI I, I, -1 ++ PTR_ADD P1, P1, T1 ++ blt ZERO, I, .L_M2_N8 ++.L_M2_N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_M2_N3 ++.L_M2_N4: ++ GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 ++ GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10 ++ PTR_ADDI S1, S1, 0x10 ++ 
PTR_ADDI S2, S2, 0x10 ++ PTR_ADDI P2, P2, 0x20 ++.L_M2_N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_M2_N1 ++.L_M2_N2: ++ GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 ++ GST f, d, $f0, P3, 0x00, $f1, P3, 0x08 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI S2, S2, 0x08 ++ PTR_ADDI P3, P3, 0x10 ++.L_M2_N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_M1 ++ ++ GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 ++ GST f, s, $f0, P4, 0x00, $f1, P4, 0x04 ++ PTR_ADDI S1, S1, 0x04 ++ PTR_ADDI S2, S2, 0x04 ++ PTR_ADDI P4, P4, 0x08 ++.L_M1: ++ andi J, M, 0x01 ++ beq ZERO, J, .L_M0 ++ ++ move S1, S0 ++ PTR_ADD S2, S0, TL ++ ++ move P1, P0 ++ PTR_ADDI P0, P0, 0x20 ++ ++ PTR_SRAI I, N, 0x03 ++ beq ZERO, I, .L_M1_N7 ++.align 5 ++.L_M1_N8: ++ xvld U0, S1, 0x00 ++ ++ GST xv, , U0, P1, 0x00 ++ ++ PTR_ADDI S1, S1, 0x20 ++ ++ PTR_ADDI I, I, -1 ++ PTR_ADD P1, P1, T1 ++ blt ZERO, I, .L_M1_N8 ++.L_M1_N7: ++ andi I, N, 0x04 ++ beq ZERO, I, .L_M1_N3 ++.L_M1_N4: ++ GLD v, , $vr0, S1, 0x00 ++ GST v, , $vr0, P2, 0x00 ++ PTR_ADDI S1, S1, 0x10 ++ PTR_ADDI P2, P2, 0x10 ++.L_M1_N3: ++ andi I, N, 0x02 ++ beq ZERO, I, .L_M1_N1 ++.L_M1_N2: ++ GLD f, d, $f0, S1, 0x00 ++ GST f, d, $f0, P3, 0x00 ++ PTR_ADDI S1, S1, 0x08 ++ PTR_ADDI P3, P3, 0x08 ++.L_M1_N1: ++ andi I, N, 0x01 ++ beq ZERO, I, .L_M0 ++ ++ GLD f, s, $f0, S1, 0x00 ++ GST f, s, $f0, P4, 0x00 ++ PTR_ADDI S1, S1, 0x04 ++ PTR_ADDI P4, P4, 0x04 ++.L_M0: ++ pop_if_used 23, 8 ++ jirl $r0, $r1, 0x00 ++ EPILOGUE +diff --git a/kernel/loongarch64/sgemv_n_8_lasx.S b/kernel/loongarch64/sgemv_n_8_lasx.S +new file mode 100644 +index 000000000..52ffc320e +--- /dev/null ++++ b/kernel/loongarch64/sgemv_n_8_lasx.S +@@ -0,0 +1,463 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/08/30 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++*********************************************************************/ ++ ++/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, ++ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) ++ */ ++#define M $r4 ++#define N $r5 ++#define ALPHA $f0 ++#define A $r7 ++#define LDA $r8 ++#define X $r9 ++#define INC_X $r10 ++#define Y $r11 ++#define INC_Y $r6 ++ ++#define J $r12 ++#define I $r13 ++#define K $r14 ++#define Y_ORG $r15 ++#define OFFSET $r16 ++#define K_LDA $r17 ++#define M4 $r18 ++#define T0 $r19 ++#define PA0 $r20 ++#define PA1 $r23 ++#define PA2 $r24 ++#define PA3 $r25 ++#define PA4 $r26 ++#define PA5 $r27 ++#define PA6 $r28 ++#define PA7 $r29 ++ ++#define VALPHA $xr1 ++#define X0 $xr2 ++#define X1 $xr3 ++#define X2 $xr4 ++#define X3 $xr5 ++#define X4 $xr6 ++#define X5 $xr7 ++#define X6 $xr8 ++#define X7 $xr9 ++#define Y0 $xr10 ++#define A0 $xr11 ++#define A1 $xr12 ++#define A2 $xr13 ++#define A3 $xr14 ++#define A4 $xr15 ++#define A5 $xr16 ++#define A6 $xr17 ++#define A7 $xr18 ++ ++#define X0_F $f2 ++#define X1_F $f3 ++#define X2_F $f4 ++#define X3_F $f5 ++#define X4_F $f6 ++#define X5_F $f7 ++#define X6_F $f8 ++#define X7_F $f9 ++#define Y0_F $f10 ++#define A0_F $f11 ++#define A1_F $f12 ++#define A2_F $f13 ++#define A3_F $f14 ++#define A4_F $f15 ++#define A5_F $f16 ++#define A6_F $f17 ++#define A7_F $f18 ++ ++.macro SLOAD_X_8 ++ GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04, X2, X, 0x08, X3, X, 0x0C, \ ++ X4, X, 0x10, X5, X, 0x14, X6, X, 0x18, X7, X, 0x1C ++ GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ ++ X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA ++.endm ++ ++.macro SLOAD_X_8_GAP ++ xvldrepl.w X0, X, 0x00 ++ PTR_ADD T0, X, INC_X ++ xvldrepl.w X1, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.w X2, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.w X3, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.w X4, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.w X5, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.w X6, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.w X7, T0, 0x00 ++ GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ ++ X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA ++.endm ++ ++.macro SLOAD_X_4 ++ GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04, X2, X, 0x08, X3, X, 0x0C ++ GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA ++.endm ++ ++.macro SLOAD_X_4_GAP ++ xvldrepl.w X0, X, 0x00 ++ PTR_ADD T0, X, INC_X ++ xvldrepl.w X1, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.w X2, T0, 0x00 ++ PTR_ADD T0, T0, INC_X ++ xvldrepl.w X3, T0, 0x00 ++ GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA ++.endm ++ ++.macro SLOAD_X_2 ++ GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04 ++ GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA ++.endm ++ ++.macro SLOAD_X_2_GAP ++ xvldrepl.w X0, X, 0x00 ++ PTR_ADD T0, X, INC_X ++ xvldrepl.w X1, T0, 0x00 ++ GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA ++.endm ++ ++.macro SLOAD_X_1 ++ GLDREPL xv, w, X0, X, 0x00 ++ GMUL xvf, s, X0, X0, VALPHA ++.endm ++ ++.macro SLOAD_Y_8 ++ GLD xv, , Y0, Y, 0 ++.endm ++ ++.macro SLOAD_Y_8_GAP ++ fld.s Y0_F, Y, 0 ++ fldx.s A0_F, Y, INC_Y ++ PTR_ALSL T0, 
INC_Y, Y, 1 ++ fld.s A1_F, T0, 0 ++ fldx.s A2_F, T0, INC_Y ++ PTR_ALSL T0, INC_Y, Y, 2 ++ fld.s A3_F, T0, 0 ++ fldx.s A4_F, T0, INC_Y ++ PTR_ADD T0, T0, INC_Y ++ PTR_ADD T0, T0, INC_Y ++ fld.s A5_F, T0, 0 ++ fldx.s A6_F, T0, INC_Y ++ GINSVE0 xv, w, Y0, A0, 1, Y0, A1, 2, Y0, A2, 3, Y0, A3, 4, \ ++ Y0, A4, 5, Y0, A5, 6, Y0, A6, 7 ++.endm ++ ++.macro SLOAD_Y_1 ++ GLD f, s, Y0_F, Y, 0 ++.endm ++ ++.macro SGEMV_N_8x8 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA1, 0, \ ++ A2, PA2, 0, A3, PA3, 0, \ ++ A4, PA4, 0, A5, PA5, 0, \ ++ A6, PA6, 0, A7, PA7, 0 ++ GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0, \ ++ Y0, A2, X2, Y0, Y0, A3, X3, Y0, \ ++ Y0, A4, X4, Y0, Y0, A5, X5, Y0, \ ++ Y0, A6, X6, Y0, Y0, A7, X7, Y0 ++.endm ++ ++.macro SGEMV_N_1x8 ++ GLD_INC f, s, 0x04, \ ++ A0_F, PA0, 0, A1_F, PA1, 0, \ ++ A2_F, PA2, 0, A3_F, PA3, 0, \ ++ A4_F, PA4, 0, A5_F, PA5, 0, \ ++ A6_F, PA6, 0, A7_F, PA7, 0 ++ GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F, \ ++ Y0_F, A2_F, X2_F, Y0_F, Y0_F, A3_F, X3_F, Y0_F, \ ++ Y0_F, A4_F, X4_F, Y0_F, Y0_F, A5_F, X5_F, Y0_F, \ ++ Y0_F, A6_F, X6_F, Y0_F, Y0_F, A7_F, X7_F, Y0_F ++.endm ++ ++.macro SGEMV_N_8x4 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA1, 0, \ ++ A2, PA2, 0, A3, PA3, 0 ++ GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0, \ ++ Y0, A2, X2, Y0, Y0, A3, X3, Y0 ++.endm ++ ++.macro SGEMV_N_1x4 ++ GLD_INC f, s, 0x04, \ ++ A0_F, PA0, 0, A1_F, PA1, 0, \ ++ A2_F, PA2, 0, A3_F, PA3, 0 ++ GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F, \ ++ Y0_F, A2_F, X2_F, Y0_F, Y0_F, A3_F, X3_F, Y0_F ++.endm ++ ++.macro SGEMV_N_8x2 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA1, 0 ++ GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0 ++.endm ++ ++.macro SGEMV_N_1x2 ++ GLD_INC f, s, 0x04, \ ++ A0_F, PA0, 0, A1_F, PA1, 0 ++ GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F ++.endm ++ ++.macro SGEMV_N_1x1 ++ GLD_INC f, s, 0x04, A0_F, PA0, 0 ++ GMADD f, s, Y0_F, A0_F, X0_F, Y0_F ++.endm ++ ++.macro SSTORE_Y_8 ++ GST xv, , Y0, Y, 0 ++.endm ++ ++.macro SSTORE_Y_8_GAP ++ xvstelm.w Y0, Y, 0, 0 ++ PTR_ADD T0, Y, INC_Y ++ xvstelm.w Y0, T0, 0, 1 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.w Y0, T0, 0, 2 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.w Y0, T0, 0, 3 ++ ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.w Y0, T0, 0, 4 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.w Y0, T0, 0, 5 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.w Y0, T0, 0, 6 ++ PTR_ADD T0, T0, INC_Y ++ xvstelm.w Y0, T0, 0, 7 ++.endm ++ ++.macro SSTORE_Y_1 ++ GST f, s, Y0_F, Y, 0 ++.endm ++ ++.macro SGEMV_N_LASX XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req ++ PTR_SRLI J, N, 3 ++ beqz J, .L_\XW\()_N_7 ++ PTR_SLLI K_LDA, LDA, 3 ++ PTR_SUB K_LDA, K_LDA, M4 ++.L_\XW\()_N_L8: ++ SLOAD_\X_8 ++ xor K, K, K ++ move Y, Y_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_M_7 ++.align 5 ++.L_\XW\()_M_L8: ++ SLOAD_\Y_8 ++ SGEMV_N_8x8 ++ SSTORE_\Y_8 ++ PTR_ADDI I, I, -1 ++ PTR_ALSL Y, INC_Y, Y, 3 ++ PTR_ADDI K, K, 8 ++ bnez I, .L_\XW\()_M_L8 ++.L_\XW\()_M_7: ++ andi I, M, 7 ++ beqz I, .L_\XW\()_M_END ++.align 5 ++.L_\XW\()_M_L1: ++ SLOAD_\Y_1 ++ SGEMV_N_1x8 ++ SSTORE_\Y_1 ++ PTR_ADDI I, I, -1 ++ PTR_ADD Y, Y, INC_Y ++ PTR_ADDI K, K, 1 ++ bnez I, .L_\XW\()_M_L1 ++.L_\XW\()_M_END: ++ PTR_ADDI J, J, -1 ++#if __loongarch_grlen == 64 ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, 
K_LDA, PA7, PA7, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#endif ++ PTR_ALSL X, INC_X, X, 3 ++ bnez J, .L_\XW\()_N_L8 ++.L_\XW\()_N_7: ++ andi J, N, 4 ++ beqz J, .L_\XW\()_N_3 ++ SLOAD_\X_4 ++ xor K, K, K ++ move Y, Y_ORG ++ ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_N_4_M_7 ++.align 5 ++.L_\XW\()_N_4_M_L8: ++ SLOAD_\Y_8 ++ SGEMV_N_8x4 ++ SSTORE_\Y_8 ++ PTR_ADDI I, I, -1 ++ PTR_ADDI K, K, 8 ++ PTR_ALSL Y, INC_Y, Y, 3 ++ bnez I, .L_\XW\()_N_4_M_L8 ++.L_\XW\()_N_4_M_7: ++ andi I, M, 7 ++ beqz I, .L_\XW\()_N_4_M_END ++.align 5 ++.L_\XW\()_N_4_M_L1: ++ SLOAD_\Y_1 ++ SGEMV_N_1x4 ++ SSTORE_\Y_1 ++ PTR_ADDI I, I, -1 ++ PTR_ADD Y, Y, INC_Y ++ PTR_ADDI K, K, 1 ++ bnez I, .L_\XW\()_N_4_M_L1 ++.L_\XW\()_N_4_M_END: ++ PTR_SLLI K_LDA, LDA, 2 ++ PTR_SUB K_LDA, K_LDA, M4 ++#if __loongarch_grlen == 64 ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#endif ++ PTR_ALSL X, INC_X, X, 2 ++.L_\XW\()_N_3: ++ andi J, N, 2 ++ beqz J, .L_\XW\()_N_1 ++ SLOAD_\X_2 ++ xor K, K, K ++ move Y, Y_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_N_2_M_7 ++.align 5 ++.L_\XW\()_N_2_M_L8: ++ SLOAD_\Y_8 ++ SGEMV_N_8x2 ++ SSTORE_\Y_8 ++ PTR_ADDI I, I, -1 ++ PTR_ADDI K, K, 8 ++ PTR_ALSL Y, INC_Y, Y, 3 ++ bnez I, .L_\XW\()_N_2_M_L8 ++.L_\XW\()_N_2_M_7: ++ andi I, M, 7 ++ beqz I, .L_\XW\()_N_2_M_END ++.align 5 ++.L_\XW\()_N_2_M_L1: ++ SLOAD_\Y_1 ++ SGEMV_N_1x2 ++ SSTORE_\Y_1 ++ PTR_ADDI I, I, -1 ++ PTR_ADD Y, Y, INC_Y ++ PTR_ADDI K, K, 1 ++ bnez I, .L_\XW\()_N_2_M_L1 ++.L_\XW\()_N_2_M_END: ++ PTR_SLLI K_LDA, LDA, 1 ++ PTR_SUB K_LDA, K_LDA, M4 ++ PTR_ADD PA0, PA0, K_LDA ++ PTR_ADD PA1, PA1, K_LDA ++ PTR_ALSL X, INC_X, X, 1 ++.L_\XW\()_N_1: ++ andi J, N, 1 ++ beqz J, .L_END ++ SLOAD_\X_1 ++ xor K, K, K ++ move Y, Y_ORG ++ move I, M ++ beqz I, .L_END ++.align 5 ++.L_\XW\()_N_1_M_L1: ++ SLOAD_\Y_1 ++ SGEMV_N_1x1 ++ SSTORE_\Y_1 ++ PTR_ADDI I, I, -1 ++ PTR_ADD Y, Y, INC_Y ++ PTR_ADDI K, K, 1 ++ bnez I, .L_\XW\()_N_1_M_L1 ++ b .L_END ++.endm ++ ++ PROLOGUE ++ PTR_LD INC_Y, $sp, 0 ++ push_if_used 17 + 7, 19 ++ PTR_ADDI K, $r0, 0x01 ++ PTR_SUB I, INC_X, K ++ PTR_SUB J, INC_Y, K ++ maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ ++ maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ ++ PTR_ALSL I, I, J, 1 ++ GSLLI , d, LDA, LDA, 2, INC_X, INC_X, 2, INC_Y, INC_Y, 2, M4, M, 2 ++ xvreplve0.w VALPHA, $xr0 ++ move Y_ORG, Y ++ move PA0, A ++#if __loongarch_grlen == 64 ++ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#else ++ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#endif ++ la.local T0, .L_GAP_TABLE ++ PTR_ALSL I, I, T0, 1 ++ ld.h K, I, 0 ++ PTR_ADD T0, T0, K ++ jirl $r0, T0, 0 ++.L_GAP_TABLE: ++ .hword .L_GAP_0_0 - .L_GAP_TABLE ++ .hword .L_GAP_0_1 - .L_GAP_TABLE ++ .hword .L_GAP_1_0 - .L_GAP_TABLE ++ .hword .L_GAP_1_1 - .L_GAP_TABLE ++.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ ++ SGEMV_N_LASX GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 ++.L_GAP_0_1: /* if (inc_x == 1) && 
(incy != 1) */ ++ SGEMV_N_LASX GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 ++.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ ++ SGEMV_N_LASX GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 ++.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ ++ SGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 ++.L_END: ++ pop_if_used 17 + 7, 19 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/kernel/loongarch64/sgemv_t_8_lasx.S b/kernel/loongarch64/sgemv_t_8_lasx.S +new file mode 100644 +index 000000000..f4bfffb42 +--- /dev/null ++++ b/kernel/loongarch64/sgemv_t_8_lasx.S +@@ -0,0 +1,405 @@ ++/******************************************************************************* ++Copyright (c) 2023, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*******************************************************************************/ ++#define ASSEMBLER ++ ++#include "common.h" ++#include "loongarch64_asm.S" ++ ++/********************************************************************* ++* 2023/08/30 guxiwei ++* UTEST : OK ++* CTEST : OK ++* TEST : OK ++* ++* ++*********************************************************************/ ++ ++/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, ++ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) ++ */ ++#define M $r4 ++#define N $r5 ++#define ALPHA $f0 ++#define A $r7 ++#define LDA $r8 ++#define X $r9 ++#define INC_X $r10 ++#define Y $r11 ++#define INC_Y $r6 ++ ++#define J $r12 ++#define I $r13 ++#define K $r14 ++#define PY0 $r14 ++#define X_ORG $r15 ++#define PY1 $r16 ++#define K_LDA $r17 ++#define PY2 $r18 ++#define T0 $r19 ++#define PA0 $r20 ++#define PA1 $r23 ++#define PA2 $r24 ++#define PA3 $r25 ++#define PA4 $r26 ++#define PA5 $r27 ++#define PA6 $r28 ++#define PA7 $r29 ++#define M4 $r30 ++ ++#define VALPHA $xr0 ++#define X0 $xr1 ++#define A0 $xr2 ++#define A1 $xr3 ++#define A2 $xr4 ++#define A3 $xr5 ++#define A4 $xr6 ++#define A5 $xr7 ++#define A6 $xr8 ++#define A7 $xr9 ++#define TP0 $xr10 ++#define TP1 $xr11 ++#define TP2 $xr12 ++#define TP3 $xr13 ++#define TP4 $xr14 ++#define TP5 $xr15 ++#define TP6 $xr16 ++#define TP7 $xr17 ++#define Y0 $xr2 ++#define Y1 $xr3 ++#define Y2 $xr4 ++#define Y3 $xr5 ++#define Y4 $xr6 ++#define Y5 $xr7 ++#define Y6 $xr8 ++#define Y7 $xr9 ++ ++.macro ZERO_Y8 ++ GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ ++ TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 ++.endm ++ ++.macro ZERO_Y4 ++ GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 ++.endm ++ ++.macro ZERO_Y2 ++ GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1 ++.endm ++ ++.macro ZERO_Y1 ++ GXOR xv, v, TP0, TP0, TP0 ++.endm ++ ++.macro SLOAD_X8 ++ GLD xv, , X0, X, 0x00 ++.endm ++ ++.macro SLOAD_X8_GAP ++ fld.s $f1, X, 0x00 ++ fldx.s $f2, X, INC_X ++ PTR_ALSL T0, INC_X, X, 1 ++ fld.s $f3, T0, 0x00 ++ fldx.s $f4, T0, INC_X ++ GINSVE0 xv, w, X0, A0, 1, X0, A1, 2, X0, A2, 3 ++ PTR_ALSL T0, INC_X, X, 2 ++ fld.s $f2, T0, 0x00 ++ fldx.s $f3, T0, INC_X ++ PTR_ALSL T0, INC_X, T0, 1 ++ fld.s $f4, T0, 0x00 ++ fldx.s $f5, T0, INC_X ++ GINSVE0 xv, w, X0, A0, 4, X0, A1, 5, X0, A2, 6, X0, A3, 7 ++.endm ++ ++.macro SGEMV_T_8x8 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA1, 0, \ ++ A2, PA2, 0, A3, PA3, 0, \ ++ A4, PA4, 0, A5, PA5, 0, \ ++ A6, PA6, 0, A7, PA7, 0 ++ GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1, \ ++ TP2, A2, X0, TP2, TP3, A3, X0, TP3, \ ++ TP4, A4, X0, TP4, TP5, A5, X0, TP5, \ ++ TP6, A6, X0, TP6, TP7, A7, X0, TP7 ++.endm ++ ++.macro SGEMV_T_4x8 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA1, 0, \ ++ A2, PA2, 0, A3, PA3, 0 ++ GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1, \ ++ TP2, A2, X0, TP2, TP3, A3, X0, TP3 ++.endm ++ ++.macro SGEMV_T_2x8 ++ GLD_INC xv, , 0x20, \ ++ A0, PA0, 0, A1, PA1, 0 ++ GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1 ++.endm ++ ++.macro SGEMV_T_LASX XW:req X8:req, X4:req ++ PTR_SRLI J, N, 3 ++ beqz J, .L_\XW\()_N_7 ++ PTR_SLLI K_LDA, LDA, 3 ++ PTR_SUB K_LDA, K_LDA, M4 ++.L_\XW\()_N_L8: ++ ZERO_Y8 ++ move X, X_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_M_7 ++.align 5 ++.L_\XW\()_M_L8: ++ SLOAD_\X8 ++ SGEMV_T_8x8 ++ PTR_ADDI I, I, -1 ++ PTR_ALSL X, INC_X, X, 3 ++ bnez I, .L_\XW\()_M_L8 ++.L_\XW\()_M_7: ++ // Accumulated ++ GACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, 
TP3, Y4, TP4, \ ++ Y5, TP5, Y6, TP6, Y7, TP7 ++ andi I, M, 7 ++ beqz I, .L_\XW\()_M_END ++.align 5 ++.L_\XW\()_M_L1: ++ fld.s $f1, X, 0x00 ++ fld.s $f10, PA0, 0x00 ++ fld.s $f11, PA1, 0x00 ++ fld.s $f12, PA2, 0x00 ++ fld.s $f13, PA3, 0x00 ++ fld.s $f14, PA4, 0x00 ++ fld.s $f15, PA5, 0x00 ++ fld.s $f16, PA6, 0x00 ++ fld.s $f17, PA7, 0x00 ++#if __loongarch_grlen == 64 ++ GADDI , d, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \ ++ PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04 ++#elif __loongarch_grlen == 32 ++ GADDI , w, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \ ++ PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04 ++#else ++ GADDI , d, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \ ++ PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04 ++#endif ++ GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, \ ++ $f6, $f14, $f1, $f6, $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, ++ PTR_ADDI I, I, -1 ++ PTR_ADD X, X, INC_X ++ bnez I, .L_\XW\()_M_L1 ++.L_\XW\()_M_END: ++ fld.s $f10, Y, 0x00 ++ fldx.s $f11, Y, INC_Y ++ PTR_ALSL PY0, INC_Y, Y, 1 ++ fld.s $f12, PY0, 0x00 ++ fldx.s $f13, PY0, INC_Y ++ PTR_ALSL PY1, INC_Y, Y, 2 ++ fld.s $f14, PY1, 0x00 ++ fldx.s $f15, PY1, INC_Y ++ PTR_ALSL PY2, INC_Y, PY1, 1 ++ fld.s $f16, PY2, 0x00 ++ fldx.s $f17, PY2, INC_Y ++ ++ GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, \ ++ $f14, ALPHA, $f6, $f14, $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17 ++ ++ PTR_ADDI J, J, -1 ++#if __loongarch_grlen == 64 ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ ++ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA ++#endif ++ fst.s $f10, Y, 0x00 ++ fstx.s $f11, Y, INC_Y ++ fst.s $f12, PY0, 0x00 ++ fstx.s $f13, PY0, INC_Y ++ fst.s $f14, PY1, 0x00 ++ fstx.s $f15, PY1, INC_Y ++ fst.s $f16, PY2, 0x00 ++ fstx.s $f17, PY2, INC_Y ++ ++ PTR_ALSL Y, INC_Y, Y, 3 ++ bnez J, .L_\XW\()_N_L8 ++.L_\XW\()_N_7: ++ andi J, N, 4 ++ beqz J, .L_\XW\()_N_3 ++ ZERO_Y4 ++ move X, X_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_N_4_M_7 ++.align 5 ++.L_\XW\()_N_4_M_L8: ++ SLOAD_\X8 ++ SGEMV_T_4x8 ++ PTR_ADDI I, I, -1 ++ PTR_ALSL X, INC_X, X, 3 ++ bnez I, .L_\XW\()_N_4_M_L8 ++.L_\XW\()_N_4_M_7: ++ // Accumulated ++ GACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 ++ andi I, M, 7 ++ beqz I, .L_\XW\()_N_4_M_END ++.align 5 ++.L_\XW\()_N_4_M_L1: ++ fld.s $f1, X, 0x00 ++ GLD_INC f, s, 0x04, $f10, PA0, 0x00, $f11, PA1, 0x00, $f12, PA2, 0x00, $f13, PA3, 0x00 ++ GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5 ++ PTR_ADDI I, I, -1 ++ PTR_ADD X, X, INC_X ++ bnez I, .L_\XW\()_N_4_M_L1 ++.L_\XW\()_N_4_M_END: ++ fld.s $f10, Y, 0x00 ++ fldx.s $f11, Y, INC_Y ++ PTR_ALSL PY0, INC_Y, Y, 1 ++ fld.s $f12, PY0, 0x00 ++ fldx.s $f13, PY0, INC_Y ++ ++ GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13 ++ ++ PTR_SLLI K_LDA, LDA, 2 ++ PTR_SUB K_LDA, K_LDA, M4 ++ ++#if __loongarch_grlen == 64 ++ GADD , 
d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA ++#endif ++ fst.s $f10, Y, 0x00 ++ fstx.s $f11, Y, INC_Y ++ fst.s $f12, PY0, 0x00 ++ fstx.s $f13, PY0, INC_Y ++ PTR_ALSL Y, INC_Y, Y, 2 ++.L_\XW\()_N_3: ++ andi J, N, 2 ++ beqz J, .L_\XW\()_N_1 ++ ZERO_Y2 ++ move X, X_ORG ++ PTR_SRLI I, M, 3 ++ beqz I, .L_\XW\()_N_2_M_7 ++.align 5 ++.L_\XW\()_N_2_M_L8: ++ SLOAD_\X8 ++ SGEMV_T_2x8 ++ PTR_ADDI I, I, -1 ++ PTR_ALSL X, INC_X, X, 3 ++ bnez I, .L_\XW\()_N_2_M_L8 ++.L_\XW\()_N_2_M_7: ++ // Accumulated ++ GACC xvf, s, Y0, TP0, Y1, TP1 ++ andi I, M, 7 ++ beqz I, .L_\XW\()_N_2_M_END ++.align 5 ++.L_\XW\()_N_2_M_L1: ++ fld.s $f1, X, 0x00 ++ GLD_INC f, s, 0x04, $f10, PA0, 0x00, $f11, PA1, 0x00 ++ GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3 ++ PTR_ADDI I, I, -1 ++ PTR_ADD X, X, INC_X ++ bnez I, .L_\XW\()_N_2_M_L1 ++.L_\XW\()_N_2_M_END: ++ fld.s $f10, Y, 0x00 ++ fldx.s $f11, Y, INC_Y ++ ++ GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11 ++ ++ PTR_SLLI K_LDA, LDA, 1 ++ PTR_SUB K_LDA, K_LDA, M4 ++ ++#if __loongarch_grlen == 64 ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA ++#else ++ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA ++#endif ++ fst.s $f10, Y, 0x00 ++ fstx.s $f11, Y, INC_Y ++ PTR_ALSL Y, INC_Y, Y, 1 ++.L_\XW\()_N_1: ++ andi J, N, 1 ++ beqz J, .L_END ++ ZERO_Y1 ++ move X, X_ORG ++ move I, M ++ beqz I, .L_END ++.align 5 ++.L_\XW\()_N_1_M_L1: ++ fld.s $f2, PA0, 0x00 ++ fld.s $f1, X, 0x00 ++ fmadd.s $f10, $f2, $f1, $f10 ++ PTR_ADDI I, I, -1 ++ PTR_ADD X, X, INC_X ++ PTR_ADDI PA0, PA0, 0x04 ++ bnez I, .L_\XW\()_N_1_M_L1 ++ ++ fld.s $f2, Y, 0x00 ++ fmadd.s $f2, ALPHA, $f10, $f2 ++ fst.s $f2, Y, 0x00 ++ b .L_END ++.endm ++ ++ PROLOGUE ++ PTR_LD INC_Y, $sp, 0 ++ push_if_used 17 + 8, 18 ++ PTR_ADDI K, $r0, 0x01 ++ PTR_SUB I, INC_X, K ++ maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ ++ GSLLI , d, LDA, LDA, 2, INC_X, INC_X, 2, INC_Y, INC_Y, 2, M4, M, 2 ++ xvreplve0.w VALPHA, $xr0 ++ move X_ORG, X ++ move PA0, A ++#if __loongarch_grlen == 64 ++ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#elif __loongarch_grlen == 32 ++ GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#else ++ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ ++ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA ++#endif ++ la.local T0, .L_GAP_TABLE ++ PTR_ALSL I, I, T0, 1 ++ ld.h K, I, 0 ++ PTR_ADD T0, T0, K ++ jirl $r0, T0, 0 ++.L_GAP_TABLE: ++ .hword .L_GAP_0 - .L_GAP_TABLE ++ .hword .L_GAP_1 - .L_GAP_TABLE ++.L_GAP_0: /* if (incx == 1) */ ++ SGEMV_T_LASX GAP_0, X8, X4 ++.L_GAP_1: /* if (incx != 1) */ ++ SGEMV_T_LASX GAP_1, X8_GAP, X4_GAP ++.L_END: ++ pop_if_used 17 + 8, 18 ++ jirl $r0, $r1, 0x0 ++ EPILOGUE +diff --git a/lapack/laswp/loongarch64/Makefile b/lapack/laswp/loongarch64/Makefile +index 71e5a87cb..1c85667ec 100644 +--- a/lapack/laswp/loongarch64/Makefile ++++ b/lapack/laswp/loongarch64/Makefile +@@ -1,6 +1,11 @@ + TOPDIR = ../../.. 
+ include ../../../Makefile.system + ++ifeq ($(DYNAMIC_ARCH), 1) ++LASWP = ../generic/laswp_k_4.c ++ZLASWP = ../generic/zlaswp_k_4.c ++endif ++ + ifndef LASWP + LASWP = ../generic/laswp_k.c + endif +diff --git a/param.h b/param.h +index f1f5cbdad..a34e806c0 100644 +--- a/param.h ++++ b/param.h +@@ -2845,31 +2845,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #define GEMM_DEFAULT_OFFSET_B 0 + #define GEMM_DEFAULT_ALIGN 0x0ffffUL + ++#if defined(NO_LASX) ++#define DGEMM_DEFAULT_UNROLL_N 8 ++#define DGEMM_DEFAULT_UNROLL_M 2 + #define SGEMM_DEFAULT_UNROLL_N 8 ++#define SGEMM_DEFAULT_UNROLL_M 2 ++#else + #define DGEMM_DEFAULT_UNROLL_N 4 ++#define DGEMM_DEFAULT_UNROLL_M 16 ++#define SGEMM_DEFAULT_UNROLL_N 8 ++#define SGEMM_DEFAULT_UNROLL_M 16 ++#endif ++ + #define QGEMM_DEFAULT_UNROLL_N 2 + #define CGEMM_DEFAULT_UNROLL_N 4 + #define ZGEMM_DEFAULT_UNROLL_N 4 + #define XGEMM_DEFAULT_UNROLL_N 1 + +-#define SGEMM_DEFAULT_UNROLL_M 2 +-#define DGEMM_DEFAULT_UNROLL_M 16 + #define QGEMM_DEFAULT_UNROLL_M 2 + #define CGEMM_DEFAULT_UNROLL_M 1 + #define ZGEMM_DEFAULT_UNROLL_M 1 + #define XGEMM_DEFAULT_UNROLL_M 1 + +-#define SGEMM_DEFAULT_P 512 ++#define SGEMM_DEFAULT_P 256 + #define DGEMM_DEFAULT_P 32 + #define CGEMM_DEFAULT_P 128 + #define ZGEMM_DEFAULT_P 128 + +-#define SGEMM_DEFAULT_R 12288 ++#define SGEMM_DEFAULT_R 1024 + #define DGEMM_DEFAULT_R 858 + #define CGEMM_DEFAULT_R 4096 + #define ZGEMM_DEFAULT_R 4096 + +-#define SGEMM_DEFAULT_Q 128 ++#define SGEMM_DEFAULT_Q 256 + #define DGEMM_DEFAULT_Q 152 + #define CGEMM_DEFAULT_Q 128 + #define ZGEMM_DEFAULT_Q 128 +-- +2.20.1 + diff --git a/OpenBLAS-0.3.23.tar.gz b/OpenBLAS-0.3.23.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..31bec85b5297b69820ba189c120feebc730345dd Binary files /dev/null and b/OpenBLAS-0.3.23.tar.gz differ diff --git a/openblas.spec b/openblas.spec new file mode 100644 index 0000000000000000000000000000000000000000..b201ae57aea0da4cba63e287043f3cd98b0d76c0 --- /dev/null +++ b/openblas.spec @@ -0,0 +1,540 @@ +%define anolis_release 2 + +%bcond_with system_lapack +%global lapackver 3.9.1 + +Name: openblas +Summary: An optimized BLAS library based on GotoBLAS2 +Version: 0.3.23 +Release: %{anolis_release}%{?dist} +License: BSD +URL: https://github.com/xianyi/OpenBLAS/ +Source0: https://github.com/xianyi/OpenBLAS/releases/download/v%{version}/OpenBLAS-%{version}.tar.gz +Patch0001: 0001-openblas-0.2.15-system_lapack.patch +Patch0002: 0002-openblas-0.2.5-libname.patch +Patch0003: 0003-openblas-0.3.11-tests.patch +Patch0004: 0004-OpenBLAS-0.3.23-Add-opt-for-LoongArch64.patch + +BuildRequires: make gcc gcc-c++ gcc-gfortran +BuildRequires: perl-devel +Obsoletes: %{name}-Rblas < %{EVR} + +%global execstack 1 +%if %{execstack} +BuildRequires: execstack +%endif + +%if %{with system_lapack} +BuildRequires: lapack-static +%global lapacke 1 + +%else +%global lapacke 1 +Provides: bundled(lapack) = %{lapackver} +%endif + +%global build64 1 +%bcond_without cpp_thread_check + +%if %{with system_lapack} +%if %build64 +BuildRequires: lapack64-static +%endif +%endif + +%description +OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version. + +For a general introduction to the BLAS routines, please refer to the extensive documentation of +their reference implementation hosted at netlib: https://www.netlib.org/blas. 
On that site you will +likewise find documentation for the reference implementation of the higher-level library LAPACK - +the Linear Algebra Package that comes included with OpenBLAS. If you are looking for a general primer +or refresher on Linear Algebra, the set of six 20-minute lecture videos by Prof. Gilbert Strang on either +MIT OpenCourseWare https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/ or +Youtube https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek may be helpful. + +%package serial +Summary: An optimized BLAS library based on GotoBLAS2, serial version +Requires: %{name} = %{EVR} + +%description serial +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the sequential library compiled with a 32-bit +integer interface. + +%package openmp +Summary: An optimized BLAS library based on GotoBLAS2, OpenMP version +Requires: %{name} = %{EVR} + +%description openmp +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the library compiled with OpenMP support with +32-bit integer interface. + +%package threads +Summary: An optimized BLAS library based on GotoBLAS2, pthreads version +Requires: %{name} = %{EVR} + +%description threads +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the library compiled with threading support and +a 32-bit integer interface. + +%if %build64 +%package serial64 +Summary: An optimized BLAS library based on GotoBLAS2, serial version +Requires: %{name} = %{EVR} + +%description serial64 +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the sequential library compiled with a 64-bit +integer interface. + +%package openmp64 +Summary: An optimized BLAS library based on GotoBLAS2, OpenMP version +Requires: %{name} = %{EVR} + +%description openmp64 +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the library compiled with OpenMP support and +64-bit integer interface. + +%package threads64 +Summary: An optimized BLAS library based on GotoBLAS2, pthreads version +Requires: %{name} = %{EVR} + +%description threads64 +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the library compiled with threading support and +64-bit integer interface. + +%package serial64_ +Summary: An optimized BLAS library based on GotoBLAS2, serial version +Requires: %{name} = %{EVR} + +%description serial64_ +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. 
http://www.rdcps.ac.cn + +This package contains the sequential library compiled with a 64-bit +integer interface and a symbol name suffix. + +%package openmp64_ +Summary: An optimized BLAS library based on GotoBLAS2, OpenMP version +Requires: %{name} = %{EVR} + +%description openmp64_ +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the library compiled with OpenMP support and +64-bit integer interface and a symbol name suffix. + +%package threads64_ +Summary: An optimized BLAS library based on GotoBLAS2, pthreads version +Requires: %{name} = %{EVR} + +%description threads64_ +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the library compiled with threading support and +64-bit integer interface and a symbol name suffix. +%endif + +%package devel +Summary: Development headers and libraries for OpenBLAS +Requires: %{name} = %{EVR} +Requires: %{name}-serial = %{EVR} +Requires: %{name}-openmp = %{EVR} +Requires: %{name}-threads = %{EVR} +%if %build64 +Requires: %{name}-serial64 = %{EVR} +Requires: %{name}-serial64_ = %{EVR} +Requires: %{name}-openmp64 = %{EVR} +Requires: %{name}-threads64 = %{EVR} +Requires: %{name}-openmp64_ = %{EVR} +Requires: %{name}-threads64_ = %{EVR} +%endif +Requires: %{name}-srpm-macros + +%description devel +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the development headers and libraries. + +%package static +Summary: Static version of OpenBLAS +Requires: %{name}-devel = %{EVR} + +%description static +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD \ +version. The project is supported by the Lab of Parallel Software and \ +Computational Science, ISCAS. http://www.rdcps.ac.cn + +This package contains the static libraries. + +%prep +%setup -q -c -T + +tar zxf %{SOURCE0} +cd OpenBLAS-%{version} +%if %{with system_lapack} +%patch0001 -p1 -b .system_lapack +%endif +%patch0002 -p1 -b .libname +%patch0003 -p1 -b .tests +%patch0004 -p1 -b .Add-opt-for-LoongArch64 + +find -name \*.f -exec chmod 644 {} \; + +%if %{with system_lapack} +rm -rf lapack-netlib +%endif + +cd .. +cp -ar OpenBLAS-%{version} openmp +cp -ar OpenBLAS-%{version} threaded + +%if %build64 +for d in {serial,threaded,openmp}64{,_}; do + cp -ar OpenBLAS-%{version} $d +done +%endif + +mv OpenBLAS-%{version} serial + +%if %{with system_lapack} +mkdir netliblapack +cd netliblapack +ar x %{_libdir}/liblapack_pic.a +for f in laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs; do + \rm {c,d,s,z}$f.o +done + +%if %{lapacke} +ar x %{_libdir}/liblapacke.a +%endif + +echo "TOPDIR = .." > Makefile +echo "include ../Makefile.system" >> Makefile +echo "COMMONOBJS = \\" >> Makefile +for i in *.o; do + echo "$i \\" >> Makefile +done +echo -e "\n\ninclude \$(TOPDIR)/Makefile.tail" >> Makefile + +%if %{lapacke} +# Copy include files +cp -a %{_includedir}/lapacke . +%endif +cd .. 
+ +for d in serial threaded openmp; do + cp -pr netliblapack $d +done +rm -rf netliblapack + +%if %build64 +mkdir netliblapack64 +cd netliblapack64 +ar x %{_libdir}/liblapack64_pic.a +for f in laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs; do + \rm {c,d,s,z}$f.o +done + +%if %{lapacke} +ar x %{_libdir}/liblapacke.a +%endif + +echo "TOPDIR = .." > Makefile +echo "include ../Makefile.system" >> Makefile +echo "COMMONOBJS = \\" >> Makefile +for i in *.o; do + echo "$i \\" >> Makefile +done +echo -e "\n\ninclude \$(TOPDIR)/Makefile.tail" >> Makefile + +%if %{lapacke} +# Copy include files +cp -a %{_includedir}/lapacke . +%endif + +cd .. + +for d in {serial,threaded,openmp}64{,_}; do + cp -pr netliblapack64 $d/netliblapack +done +rm -rf netliblapack64 +%endif +%endif + +%build +%define _lto_cflags %{nil} +%if !%{lapacke} +LAPACKE="NO_LAPACKE=1" +%endif + +NMAX="NUM_THREADS=128" + +%ifarch x86_64 +TARGET="TARGET=CORE2 DYNAMIC_ARCH=1 DYNAMIC_OLDER=1" +%endif +%ifarch aarch64 +TARGET="TARGET=ARMV8 DYNAMIC_ARCH=1 DYNAMIC_OLDER=1" +%endif +%ifarch loongarch64 +TARGET="TARGET=LOONGSONGENERIC DYNAMIC_ARCH=1" +%endif + +COMMON="%{optflags} -fPIC" +FCOMMON="%{optflags} -fPIC -frecursive" +export LDFLAGS="%{__global_ldflags}" + +COMMON="%{optflags} -fPIC" +FCOMMON="$COMMON -frecursive" +make -C serial $TARGET USE_THREAD=0 USE_LOCKING=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblas" $AVX $LAPACKE INTERFACE64=0 +make -C threaded $TARGET USE_THREAD=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblasp" $AVX $LAPACKE INTERFACE64=0 + +COMMON="%{optflags} -fPIC -fopenmp -pthread" +FCOMMON="$COMMON -frecursive" +make -C openmp $TARGET USE_THREAD=1 USE_OPENMP=1 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblaso" $AVX $LAPACKE INTERFACE64=0 %{with cpp_thread_check:CPP_THREAD_SAFETY_TEST=1} + +%if %build64 +COMMON="%{optflags} -fPIC" +FCOMMON="$COMMON -frecursive -fdefault-integer-8" +make -C serial64 $TARGET USE_THREAD=0 USE_LOCKING=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblas64" $AVX $LAPACKE INTERFACE64=1 +make -C threaded64 $TARGET USE_THREAD=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblasp64" $AVX $LAPACKE INTERFACE64=1 + +COMMON="%{optflags} -fPIC -fopenmp -pthread" +FCOMMON="$COMMON -frecursive -fdefault-integer-8" +make -C openmp64 $TARGET USE_THREAD=1 USE_OPENMP=1 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblaso64" $AVX $LAPACKE INTERFACE64=1 CPP_THREAD_SAFETY_TEST=1 + +COMMON="%{optflags} -fPIC" +FCOMMON="$COMMON -frecursive -fdefault-integer-8" +make -C serial64_ $TARGET USE_THREAD=0 USE_LOCKING=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblas64_" $AVX $LAPACKE INTERFACE64=1 SYMBOLSUFFIX=64_ +make -C threaded64_ $TARGET USE_THREAD=1 USE_OPENMP=0 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblasp64_" $AVX $LAPACKE INTERFACE64=1 SYMBOLSUFFIX=64_ + +COMMON="%{optflags} -fPIC -fopenmp -pthread" +FCOMMON="$COMMON -frecursive -fdefault-integer-8" +make -C openmp64_ $TARGET USE_THREAD=1 USE_OPENMP=1 FC=gfortran CC=gcc COMMON_OPT="$COMMON" FCOMMON_OPT="$FCOMMON" $NMAX LIBPREFIX="libopenblaso64_" $AVX $LAPACKE INTERFACE64=1 SYMBOLSUFFIX=64_ CPP_THREAD_SAFETY_TEST=1 +%endif + +%install 
+make -C serial USE_THREAD=0 PREFIX=%{buildroot} OPENBLAS_LIBRARY_DIR=%{buildroot}%{_libdir} OPENBLAS_INCLUDE_DIR=%{buildroot}%{_includedir}/%name OPENBLAS_BINARY_DIR=%{buildroot}%{_bindir} OPENBLAS_CMAKE_DIR=%{buildroot}%{_libdir}/cmake install + +%if %{with system_lapack} && %{lapacke} +cp -a %{_includedir}/lapacke %{buildroot}%{_includedir}/%{name} +%endif + +suffix="" +slibname=`basename %{buildroot}%{_libdir}/libopenblas${suffix}-*.so .so` +mv %{buildroot}%{_libdir}/${slibname}.a %{buildroot}%{_libdir}/lib%{name}.a +if [[ "$suffix" != "" ]]; then + sname=$(echo $slibname | sed "s|$suffix||g") + mv %{buildroot}%{_libdir}/${slibname}.so %{buildroot}%{_libdir}/${sname}.so +else + sname=${slibname} +fi + +olibname=`echo ${slibname} | sed "s|lib%{name}|lib%{name}o|g"` +install -D -pm 644 openmp/${olibname}.a %{buildroot}%{_libdir}/lib%{name}o.a +if [[ "$suffix" != "" ]]; then + oname=$(echo $olibname | sed "s|$suffix||g") +else + oname=${olibname} +fi +install -D -pm 755 openmp/${olibname}.so %{buildroot}%{_libdir}/${oname}.so + +plibname=`echo ${slibname} | sed "s|lib%{name}|lib%{name}p|g"` +install -D -pm 644 threaded/${plibname}.a %{buildroot}%{_libdir}/lib%{name}p.a +if [[ "$suffix" != "" ]]; then + pname=$(echo $plibname | sed "s|$suffix||g") +else + pname=${plibname} +fi +install -D -pm 755 threaded/${plibname}.so %{buildroot}%{_libdir}/${pname}.so + +%if %build64 +slibname64=`echo ${slibname} | sed "s|lib%{name}|lib%{name}64|g"` +install -D -pm 644 serial64/${slibname64}.a %{buildroot}%{_libdir}/lib%{name}64.a +slibname64_=`echo ${slibname} | sed "s|lib%{name}|lib%{name}64_|g"` +install -D -pm 644 serial64_/${slibname64_}.a %{buildroot}%{_libdir}/lib%{name}64_.a + +if [[ "$suffix" != "" ]]; then + sname64=$(echo ${slibname64} | sed "s|$suffix||g") + sname64_=$(echo ${slibname64_} | sed "s|$suffix||g") +else + sname64=${slibname64} + sname64_=${slibname64_} +fi +install -D -pm 755 serial64/${slibname64}.so %{buildroot}%{_libdir}/${sname64}.so +install -D -pm 755 serial64_/${slibname64_}.so %{buildroot}%{_libdir}/${sname64_}.so + +olibname64=`echo ${slibname} | sed "s|lib%{name}|lib%{name}o64|g"` +install -D -pm 644 openmp64/${olibname64}.a %{buildroot}%{_libdir}/lib%{name}o64.a +olibname64_=`echo ${slibname} | sed "s|lib%{name}|lib%{name}o64_|g"` +install -D -pm 644 openmp64_/${olibname64_}.a %{buildroot}%{_libdir}/lib%{name}o64_.a + +if [[ "$suffix" != "" ]]; then + oname64=$(echo ${olibname64} | sed "s|$suffix||g") + oname64_=$(echo ${olibname64_} | sed "s|$suffix||g") +else + oname64=${olibname64} + oname64_=${olibname64_} +fi +install -D -pm 755 openmp64/${olibname64}.so %{buildroot}%{_libdir}/${oname64}.so +install -D -pm 755 openmp64_/${olibname64_}.so %{buildroot}%{_libdir}/${oname64_}.so + +plibname64=`echo ${slibname} | sed "s|lib%{name}|lib%{name}p64|g"` +install -D -pm 644 threaded64/${plibname64}.a %{buildroot}%{_libdir}/lib%{name}p64.a +plibname64_=`echo ${slibname} | sed "s|lib%{name}|lib%{name}p64_|g"` +install -D -pm 644 threaded64_/${plibname64_}.a %{buildroot}%{_libdir}/lib%{name}p64_.a + +if [[ "$suffix" != "" ]]; then + pname64=$(echo $plibname64 | sed "s|$suffix||g") + pname64_=$(echo $plibname64_ | sed "s|$suffix||g") +else + pname64=${plibname64} + pname64_=${plibname64_} +fi +install -D -pm 755 threaded64/${plibname64}.so %{buildroot}%{_libdir}/${pname64}.so +install -D -pm 755 threaded64_/${plibname64_}.so %{buildroot}%{_libdir}/${pname64_}.so +%endif + +pushd %{buildroot}%{_libdir} +ln -sf ${sname}.so lib%{name}.so.0 +ln -sf ${sname}.so lib%{name}.so +ln -sf 
${oname}.so lib%{name}o.so.0 +ln -sf ${oname}.so lib%{name}o.so +ln -sf ${pname}.so lib%{name}p.so.0 +ln -sf ${pname}.so lib%{name}p.so + +%if %build64 +ln -sf ${sname64}.so lib%{name}64.so.0 +ln -sf ${sname64}.so lib%{name}64.so +ln -sf ${sname64_}.so lib%{name}64_.so.0 +ln -sf ${sname64_}.so lib%{name}64_.so +ln -sf ${oname64}.so lib%{name}o64.so.0 +ln -sf ${oname64}.so lib%{name}o64.so +ln -sf ${oname64_}.so lib%{name}o64_.so.0 +ln -sf ${oname64_}.so lib%{name}o64_.so +ln -sf ${pname64}.so lib%{name}p64.so.0 +ln -sf ${pname64}.so lib%{name}p64.so +ln -sf ${pname64_}.so lib%{name}p64_.so.0 +ln -sf ${pname64_}.so lib%{name}p64_.so +%endif + +%if %{execstack} +for lib in %{buildroot}%{_libdir}/libopenblas*.so; do + execstack -c $lib +done +%endif + +rm -rf %{buildroot}%{_libdir}/cmake +rm -rf %{buildroot}%{_libdir}/pkgconfig + +%files +%license serial/LICENSE +%doc serial/Changelog.txt serial/GotoBLAS* + +%files serial +%{_libdir}/lib%{name}.so.* +%{_libdir}/lib%{name}-*.so + +%files openmp +%{_libdir}/lib%{name}o.so.* +%{_libdir}/lib%{name}o-*.so + +%files threads +%{_libdir}/lib%{name}p.so.* +%{_libdir}/lib%{name}p-*.so + +%if %build64 +%files serial64 +%{_libdir}/lib%{name}64.so.* +%{_libdir}/lib%{name}64-*.so + +%files openmp64 +%{_libdir}/lib%{name}o64.so.* +%{_libdir}/lib%{name}o64-*.so + +%files threads64 +%{_libdir}/lib%{name}p64.so.* +%{_libdir}/lib%{name}p64-*.so + +%files serial64_ +%{_libdir}/lib%{name}64_.so.* +%{_libdir}/lib%{name}64_-*.so + +%files openmp64_ +%{_libdir}/lib%{name}o64_.so.* +%{_libdir}/lib%{name}o64_-*.so + +%files threads64_ +%{_libdir}/lib%{name}p64_.so.* +%{_libdir}/lib%{name}p64_-*.so +%endif + +%files devel +%{_includedir}/%{name}/ +%{_libdir}/lib%{name}.so +%{_libdir}/lib%{name}p.so +%{_libdir}/lib%{name}o.so +%if %build64 +%{_libdir}/lib%{name}p64_.so +%{_libdir}/lib%{name}o64.so +%{_libdir}/lib%{name}p64.so +%{_libdir}/lib%{name}64_.so +%{_libdir}/lib%{name}o64_.so +%{_libdir}/lib%{name}64.so +%endif + +%files static +%{_libdir}/lib%{name}.a +%{_libdir}/lib%{name}o.a +%{_libdir}/lib%{name}p.a +%if %build64 +%{_libdir}/lib%{name}o64.a +%{_libdir}/lib%{name}p64.a +%{_libdir}/lib%{name}o64_.a +%{_libdir}/lib%{name}p64_.a +%{_libdir}/lib%{name}64.a +%{_libdir}/lib%{name}64_.a +%endif + +%changelog +* Tue Oct 31 2023 XiWei Gu - 0.3.23-2 +- Add opt for LoongArch64 + +* Sun Apr 16 2023 Funda Wang - 0.3.23-1 +- New version 0.3.23 + +* Sat Apr 15 2023 Heng Qi - 0.3.21-2 +- Refactor the spec file + +* Fri Jan 27 2023 Funda Wang - 0.3.21-1 +- Import package for anolis 23