diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..23d4bb35443429c9ab12932e3ecc6502ac8f4863
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,50 @@
+vectorBlas is licensed under the Apache License.
+
+Copyright (C) 2023. Huawei Technologies Co., Ltd.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements. See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to You under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+
+-----------------------------------------------------------------------------
+
+This product also contains code from third parties, under the following licenses:
+
+f2jblas
+-------
+
+Copyright © 2022 The University of Tennessee. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+· Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+· Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer listed in this license in the documentation
+ and/or other materials provided with the distribution.
+· Neither the name of the copyright holders nor the names of its contributors may be used to endorse
+ or promote products derived from this software without specific prior written permission.
+
+This software is provided by the copyright holders and contributors "as is" and any express or implied warranties,
+including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose
+are disclaimed. In no event shall the copyright owner or contributors be liable for any direct, indirect,
+incidental, special, exemplary, or consequential damages (including, but not limited to, procurement of
+substitute goods or services; loss of use, data, or profits; or business interruption) however caused and
+on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise)
+arising in any way out of the use of this software, even if advised of the possibility of such damage.
+
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000000000000000000000000000000000000..8d54c1b2ca4b3b3682db043028a7c0cd4c405cd1
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,35 @@
+
+
+ 4.0.0
+
+ com.huawei.vector
+ parent
+ 1.0
+ pom
+
+
+ vectorBlas
+
+
+
+ 8
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.8.0
+
+ UTF-8
+ ${java.version}
+ ${java.version}
+ ${java.version}
+
+
+
+
+
diff --git a/vectorBlas/pom.xml b/vectorBlas/pom.xml
new file mode 100644
index 0000000000000000000000000000000000000000..9d1da1be401b26bdce2ed3b852d1d7ee5a8d3c94
--- /dev/null
+++ b/vectorBlas/pom.xml
@@ -0,0 +1,85 @@
+
+
+ 4.0.0
+
+
+ com.huawei.vector
+ parent
+ 1.0
+ ../pom.xml
+
+
+ vectorBlas
+ 1.0
+ jar
+
+
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+ 3.0.0-M7
+
+ @{argLine} --add-modules=jdk.incubator.vector
+
+
+
+ org.jacoco
+ jacoco-maven-plugin
+ 0.8.8
+
+
+
+ prepare-agent
+
+
+
+ default-report
+ test
+
+ report
+
+
+
+
+
+
+
+
+
+ org.junit.jupiter
+ junit-jupiter-engine
+ 5.9.1
+ test
+
+
+ org.junit.vintage
+ junit-vintage-engine
+ 5.9.1
+ test
+
+
+ net.sourceforge.f2j
+ arpack_combined_all
+ 0.1
+ compile
+
+
+ org.jacoco
+ jacoco-maven-plugin
+ 0.8.8
+
+
+ org.slf4j
+ slf4j-api
+ 2.0.4
+
+
+ org.slf4j
+ slf4j-simple
+ 2.0.4
+
+
+
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/BLAS.java b/vectorBlas/src/main/java/com/huawei/vectorblas/BLAS.java
new file mode 100644
index 0000000000000000000000000000000000000000..adac74745ad00d58437b0737895ea26785f11623
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/BLAS.java
@@ -0,0 +1,346 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas;
+
+public interface BLAS {
+ /*
+ * Every routine is declared in two forms: a short form, and a long form that takes an extra int
+ * offset (xOffset, yOffset, aOffset, ...) right after each array argument - presumably the index
+ * at which that array's data starts; confirm against the implementing class. Routines whose
+ * names start with d (and idamax) work on double data, those starting with s (and isamax) on float.
+ *
+ * BLAS 1: the array arguments are the vectors x and/or y (plus param for srotm/drotm).
+ */
+ double dasum(int n, double[] x, int incx);
+
+ double dasum(int n, double[] x, int xOffset, int incx);
+
+ float sasum(int n, float[] x, int incx);
+
+ float sasum(int n, float[] x, int xOffset, int incx);
+
+ void daxpy(int n, double alpha, double[] x, int incx, double[] y, int incy);
+
+ void daxpy(int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy);
+
+ void saxpy(int n, float alpha, float[] x, int incx, float[] y, int incy);
+
+ void saxpy(int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy);
+
+ void dcopy(int n, double[] x, int incx, double[] y, int incy);
+
+ void dcopy(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy);
+
+ void scopy(int n, float[] x, int incx, float[] y, int incy);
+
+ void scopy(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy);
+
+ double ddot(int n, double[] x, int incx, double[] y, int incy);
+
+ double ddot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy);
+
+ float sdot(int n, float[] x, int incx, float[] y, int incy);
+
+ float sdot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy);
+
+ float snrm2(int n, float[] x, int incx);
+
+ float snrm2(int n, float[] x, int xOffset, int incx);
+
+ double dnrm2(int n, double[] x, int incx);
+
+ double dnrm2(int n, double[] x, int xOffset, int incx);
+
+ void srot(int n, float[] x, int incx, float[] y, int incy, float c, float s);
+
+ void srot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, float c, float s);
+
+ void drot(int n, double[] x, int incx, double[] y, int incy, double c, double s);
+
+ void drot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy, double c, double s);
+
+ void srotm(int n, float[] x, int incx, float[] y, int incy, float[] param);
+
+ void srotm(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, float[] param,
+ int paramOffset);
+
+ void drotm(int n, double[] x, int incx, double[] y, int incy, double[] param);
+
+ void drotm(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy, double[] param,
+ int paramOffset);
+
+ void sscal(int n, float alp, float[] x, int incx);
+
+ void sscal(int n, float alp, float[] x, int xOffset, int incx);
+
+ void dscal(int n, double alp, double[] x, int incx);
+
+ void dscal(int n, double alp, double[] x, int xOffset, int incx);
+
+ void sswap(int n, float[] x, int incx, float[] y, int incy);
+
+ void sswap(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy);
+
+ void dswap(int n, double[] x, int incx, double[] y, int incy);
+
+ void dswap(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy);
+
+ int isamax(int n, float[] x, int incx);
+
+ int isamax(int n, float[] x, int xOffset, int incx);
+
+ int idamax(int n, double[] x, int incx);
+
+ int idamax(int n, double[] x, int xOffset, int incx);
+
+ // BLAS 2: routines taking one array a (named ap in dspr/sspr) together with the vectors x and/or y.
+ void dgbmv(String trans, int m, int n, int kl, int ku, double alpha, double[] a, int lda, double[] x,
+ int incx, double beta, double[] y, int incy);
+
+ void dgbmv(String trans, int m, int n, int kl, int ku, double alpha, double[] a, int aOffset,
+ int lda, double[] x, int xOffset, int incx, double beta, double[] y, int yOffset, int incy);
+
+ void sgbmv(String trans, int m, int n, int kl, int ku, float alpha, float[] a, int lda, float[] x,
+ int incx, float beta, float[] y, int incy);
+
+ void sgbmv(String trans, int m, int n, int kl, int ku, float alpha, float[] a, int aOffset, int lda,
+ float[] x, int xOffset, int incx, float beta, float[] y, int yOffset, int incy);
+
+ void dgemv(String trans, int m, int n, double alpha, double[] a, int lda, double[] x,
+ int incx, double beta, double[] y, int incy);
+
+ void dgemv(String trans, int m, int n, double alpha, double[] a, int aOffset, int lda, double[] x,
+ int xOffset, int incx, double beta, double[] y, int yOffset, int incy);
+
+ void sgemv(String trans, int m, int n, float alpha, float[] a, int lda, float[] x,
+ int incx, float beta, float[] y, int incy);
+
+ void sgemv(String trans, int m, int n, float alpha, float[] a, int aOffset, int lda, float[] x,
+ int xOffset, int incx, float beta, float[] y, int yOffset, int incy);
+
+ void dger(int m, int n, double alpha, double[] x, int incx, double[] y, int incy, double[] a, int lda);
+
+ void dger(int m, int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset,
+ int incy, double[] a, int aOffset, int lda);
+
+ void sger(int m, int n, float alpha, float[] x, int incx, float[] y, int incy, float[] a, int lda);
+
+ void sger(int m, int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset,
+ int incy, float[] a, int aOffset, int lda);
+
+ void dsbmv(String uplo, int n, int k, double alpha, double[] a, int lda, double[] x, int incx,
+ double beta, double[] y, int incy);
+
+ void dsbmv(String uplo, int n, int k, double alpha, double[] a, int aOffset, int lda, double[] x,
+ int xOffset, int incx, double beta, double[] y, int yOffset, int incy);
+
+ void ssbmv(String uplo, int n, int k, float alpha, float[] a, int lda, float[] x, int incx,
+ float beta, float[] y, int incy);
+
+ void ssbmv(String uplo, int n, int k, float alpha, float[] a, int aOffset, int lda, float[] x,
+ int xOffset, int incx, float beta, float[] y, int yOffset, int incy);
+
+ void dspmv(String uplo, int n, double alpha, double[] a, double[] x, int incx, double beta, double[] y, int incy);
+
+ void dspmv(String uplo, int n, double alpha, double[] a, int aOffset, double[] x, int xOffset,
+ int incx, double beta, double[] y, int yOffset, int incy);
+
+ void sspmv(String uplo, int n, float alpha, float[] a, float[] x, int incx, float beta, float[] y, int incy);
+
+ void sspmv(String uplo, int n, float alpha, float[] a, int aOffset, float[] x, int xOffset,
+ int incx, float beta, float[] y, int yOffset, int incy);
+
+ void dspr(String uplo, int n, double alpha, double[] x, int incx, double[] ap);
+
+ void dspr(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] ap, int aOffset);
+
+ void sspr(String uplo, int n, float alpha, float[] x, int incx, float[] ap);
+
+ void sspr(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] ap, int aOffset);
+
+ void dspr2(String uplo, int n, double alpha, double[] x, int incx, double[] y, int incy, double[] a);
+
+ void dspr2(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] y,
+ int yOffset, int incy, double[] a, int aOffset);
+
+ void sspr2(String uplo, int n, float alpha, float[] x, int incx, float[] y, int incy, float[] a);
+
+ void sspr2(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] y,
+ int yOffset, int incy, float[] a, int aOffset);
+
+ void dsymv(String uplo, int n, double alpha, double[] a, int lda, double[] x, int incx, double beta,
+ double[] y, int incy);
+
+ void dsymv(String uplo, int n, double alpha, double[] a, int aOffset, int lda, double[] x,
+ int xOffset, int incx, double beta, double[] y, int yOffset, int incy);
+
+ void ssymv(String uplo, int n, float alpha, float[] a, int lda, float[] x, int incx, float beta,
+ float[] y, int incy);
+
+ void ssymv(String uplo, int n, float alpha, float[] a, int aOffset, int lda, float[] x, int xOffset,
+ int incx, float beta, float[] y, int yOffset, int incy);
+
+ void dsyr(String uplo, int n, double alpha, double[] x, int incx, double[] a, int lda);
+
+ void dsyr(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] a, int aOffset, int lda);
+
+ void ssyr(String uplo, int n, float alpha, float[] x, int incx, float[] a, int lda);
+
+ void ssyr(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] a, int aOffset, int lda);
+
+ void dsyr2(String uplo, int n, double alpha, double[] x, int incx, double[] y, int incy, double[] a, int lda);
+
+ void dsyr2(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] y,
+ int yOffset, int incy, double[] a, int aOffset, int lda);
+
+ void ssyr2(String uplo, int n, float alpha, float[] x, int incx, float[] y, int incy, float[] a, int lda);
+
+ void ssyr2(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] y,
+ int yOffset, int incy, float[] a, int aOffset, int lda);
+
+ void dtbmv(String uplo, String trans, String diag, int n, int k, double[] a, int lda, double[] x, int incx);
+
+ void dtbmv(String uplo, String trans, String diag, int n, int k, double[] a, int aOffset, int lda,
+ double[] x, int xOffset, int incx);
+
+ void stbmv(String uplo, String trans, String diag, int n, int k, float[] a, int lda, float[] x, int incx);
+
+ void stbmv(String uplo, String trans, String diag, int n, int k, float[] a, int aOffset, int lda,
+ float[] x, int xOffset, int incx);
+
+ void dtbsv(String uplo, String trans, String diag, int n, int k, double[] a, int lda, double[] x, int incx);
+
+ void dtbsv(String uplo, String trans, String diag, int n, int k, double[] a, int aOffset, int lda,
+ double[] x, int xOffset, int incx);
+
+ void stbsv(String uplo, String trans, String diag, int n, int k, float[] a, int lda, float[] x, int incx);
+
+ void stbsv(String uplo, String trans, String diag, int n, int k, float[] a, int aOffset, int lda,
+ float[] x, int xOffset, int incx);
+
+ void dtpmv(String uplo, String transa, String diag, int n, double[] a, double[] x, int incx);
+
+ void dtpmv(String uplo, String transa, String diag, int n, double[] a, int aOffset, double[] x,
+ int xOffset, int incx);
+
+ void stpmv(String uplo, String transa, String diag, int n, float[] a, float[] x, int incx);
+
+ void stpmv(String uplo, String transa, String diag, int n, float[] a, int aOffset, float[] x,
+ int xOffset, int incx);
+
+ void dtpsv(String uplo, String transa, String diag, int n, double[] a, double[] x, int incx);
+
+ void dtpsv(String uplo, String transa, String diag, int n, double[] a, int aOffset, double[] x,
+ int xOffset, int incx);
+
+ void stpsv(String uplo, String transa, String diag, int n, float[] a, float[] x, int incx);
+
+ void stpsv(String uplo, String transa, String diag, int n, float[] a, int aOffset, float[] x,
+ int xOffset, int incx);
+
+ void dtrmv(String uplo, String trans, String diag, int n, double[] a, int lda, double[] x, int incx);
+
+ void dtrmv(String uplo, String trans, String diag, int n, double[] a, int aOffset, int lda,
+ double[] x, int xOffset, int incx);
+
+ void strmv(String uplo, String trans, String diag, int n, float[] a, int lda, float[] x, int incx);
+
+ void strmv(String uplo, String trans, String diag, int n, float[] a, int aOffset, int lda,
+ float[] x, int xOffset, int incx);
+
+ void dtrsv(String uplo, String transa, String diag, int n, double[] a, int lda, double[] x, int incx);
+
+ void dtrsv(String uplo, String transa, String diag, int n, double[] a,
+ int aOffset, int lda, double[] x, int xOffset, int incx);
+
+ void strsv(String uplo, String transa, String diag, int n, float[] a, int lda, float[] x, int incx);
+
+ void strsv(String uplo, String transa, String diag, int n, float[] a, int aOffset, int lda,
+ float[] x, int xOffset, int incx);
+
+ // BLAS 3: routines whose array arguments are a, b and/or c, each accompanied by its lda/ldb/ldc argument.
+ void dgemm(String transa, String transb, int m, int n, int k, double alpha, double[] a, int lda,
+ double[] b, int ldb, double beta, double[] c, int ldc);
+
+ void dgemm(String transa, String transb, int m, int n, int k, double alpha, double[] a, int aOffset,
+ int lda, double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc);
+
+ void sgemm(String transa, String transb, int m, int n, int k, float alpha, float[] a,
+ int lda, float[] b, int ldb, float beta, float[] c, int ldc);
+
+ void sgemm(String transa, String transb, int m, int n, int k, float alpha, float[] a, int aOffset,
+ int lda, float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc);
+
+ void dsymm(String side, String uplo, int m, int n, double alpha, double[] a, int lda,
+ double[] b, int ldb, double beta, double[] c, int ldc);
+
+ void dsymm(String side, String uplo, int m, int n, double alpha, double[] a, int aOffset, int lda,
+ double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc);
+
+ void ssymm(String side, String uplo, int m, int n, float alpha, float[] a, int lda,
+ float[] b, int ldb, float beta, float[] c, int ldc);
+
+ void ssymm(String side, String uplo, int m, int n, float alpha, float[] a, int aOffset, int lda,
+ float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc);
+
+ void dsyr2k(String uplo, String trans, int n, int k, double alpha, double[] a, int lda,
+ double[] b, int ldb, double beta, double[] c, int ldc);
+
+ void dsyr2k(String uplo, String trans, int n, int k, double alpha, double[] a, int aOffset, int lda,
+ double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc);
+
+ void ssyr2k(String uplo, String trans, int n, int k, float alpha, float[] a, int lda,
+ float[] b, int ldb, float beta, float[] c, int ldc);
+
+ void ssyr2k(String uplo, String trans, int n, int k, float alpha, float[] a, int aOffset, int lda,
+ float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc);
+
+ void dsyrk(String uplo, String trans, int n, int k, double alpha, double[] a, int lda,
+ double beta, double[] c, int ldc);
+
+ void dsyrk(String uplo, String trans, int n, int k, double alpha, double[] a, int aOffset, int lda,
+ double beta, double[] c, int cOffset, int ldc);
+
+ void ssyrk(String uplo, String trans, int n, int k, float alpha, float[] a, int lda,
+ float beta, float[] c, int ldc);
+
+ void ssyrk(String uplo, String trans, int n, int k, float alpha, float[] a, int aOffset, int lda,
+ float beta, float[] c, int cOffset, int ldc);
+
+ void dtrmm(String side, String uplo, String transa, String diag, int m, int n, double alpha,
+ double[] a, int lda, double[] b, int ldb);
+
+ void dtrmm(String side, String uplo, String transa, String diag, int m, int n, double alpha,
+ double[] a, int aOffset, int lda, double[] b, int bOffset, int ldb);
+
+ void strmm(String side, String uplo, String transa, String diag, int m, int n, float alpha, float[] a,
+ int lda, float[] b, int ldb);
+
+ void strmm(String side, String uplo, String transa, String diag, int m, int n, float alpha, float[] a,
+ int aOffset, int lda, float[] b, int bOffset, int ldb);
+
+ void dtrsm(String side, String uplo, String transa, String diag, int m, int n, double alpha,
+ double[] a, int lda, double[] b, int ldb);
+
+ void dtrsm(String side, String uplo, String transa, String diag, int m, int n, double alpha,
+ double[] a, int aOffset, int lda, double[] b, int bOffset, int ldb);
+
+ void strsm(String side, String uplo, String transa, String diag, int m, int n, float alpha, float[] a,
+ int lda, float[] b, int ldb);
+
+ void strsm(String side, String uplo, String transa, String diag, int m, int n, float alpha, float[] a,
+ int aOffset, int lda, float[] b, int bOffset, int ldb);
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/F2jBLAS.java b/vectorBlas/src/main/java/com/huawei/vectorblas/F2jBLAS.java
new file mode 100644
index 0000000000000000000000000000000000000000..8d4de111a559ea60d89792625dcf2fc5c31a3992
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/F2jBLAS.java
@@ -0,0 +1,1082 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas;
+
+public class F2jBLAS implements BLAS {
+ /**
+ * Returns the sum of the absolute values of n elements of x, reading x[xOffset] first and then
+ * every incx-th element after it.
+ * <p>
+ * F2jblas dasum fixed version, use long to store (n * incx) to avoid int overflow.
+ *
+ * @param n number of elements to add up
+ * @param x array holding the elements
+ * @param xOffset index in x of the first element
+ * @param incx distance in x between consecutive elements
+ * @return the accumulated sum; 0.0 when n <= 0 or incx <= 0
+ */
+ @Override
+ public double dasum(int n, double[] x, int xOffset, int incx) {
+ int unrollSize = 6;
+ double dasum = 0.0D;
+ if (n <= 0 || incx <= 0) {
+ return dasum;
+ } else {
+ int index; // 1-based position, hence the "- 1" whenever it is used to address x
+ if (incx == 1) {
+ int restm = n % unrollSize; // leading elements that do not fill a group of six
+ if (restm != 0) {
+ index = 1;
+ for (int i = restm; i > 0; --i) {
+ dasum += Math.abs(x[index - 1 + xOffset]);
+ ++index;
+ }
+ if (n < unrollSize) { // fewer than six elements: the loop above already covered all of them
+ return dasum;
+ }
+ }
+ int mp1 = restm + 1;
+ index = mp1;
+
+ for (int i = (n - mp1 + unrollSize) / unrollSize; i > 0; --i) { // n / 6 groups of six
+ dasum = dasum + Math.abs(x[index - 1 + xOffset]) + Math.abs(x[index + xOffset])
+ + Math.abs(x[index + 1 + xOffset]) + Math.abs(x[index + 2 + xOffset])
+ + Math.abs(x[index + 3 + xOffset]) + Math.abs(x[index + 4 + xOffset]);
+ index += unrollSize;
+ }
+ return dasum;
+ } else {
+ long nIncx = (long) n * incx; // product formed in long so it cannot wrap around
+ index = 1;
+ for (long i = (nIncx - 1 + incx) / incx; i > 0; --i) { // trip count works out to n
+ dasum += Math.abs(x[index - 1 + xOffset]);
+ index += incx;
+ }
+ return dasum;
+ }
+ }
+ }
+
+ /**
+ * dasum without offset: delegates to {@link #dasum(int, double[], int, int)} with xOffset = 0.
+ *
+ * @param n number of elements to add up
+ * @param x array holding the elements
+ * @param incx distance in x between consecutive elements
+ * @return whatever the offset overload returns for xOffset = 0
+ */
+ @Override
+ public double dasum(int n, double[] x, int incx) {
+ return dasum(n, x, 0, incx);
+ }
+
+
+ @Override // sasum without offset: same result as the offset overload called with xOffset = 0.
+ public float sasum(int n, float[] x, int incx) {
+ return sasum(n, x, 0, incx);
+ }
+
+ /**
+ * Returns the sum of the absolute values of n elements of x, reading x[xOffset] first and then
+ * every incx-th element after it. The sum is accumulated in a float.
+ * <p>
+ * F2jblas sasum fixed version, use long to store (n * incx) to avoid int overflow.
+ *
+ * @param n number of elements to add up
+ * @param x array holding the elements
+ * @param xOffset index in x of the first element
+ * @param incx distance in x between consecutive elements
+ * @return the accumulated sum; 0.0F when n <= 0 or incx <= 0
+ */
+ @Override
+ public float sasum(int n, float[] x, int xOffset, int incx) {
+ int unrollSize = 6;
+ float sasum = 0.0F;
+ if (n <= 0 || incx <= 0) {
+ return sasum;
+ } else {
+ int index; // 1-based position, hence the "- 1" whenever it is used to address x
+ if (incx == 1) {
+ int restm = n % unrollSize; // leading elements that do not fill a group of six
+ if (restm != 0) {
+ index = 1;
+ for (int i = restm; i > 0; --i) {
+ sasum += Math.abs(x[index - 1 + xOffset]);
+ ++index;
+ }
+ if (n < unrollSize) { // fewer than six elements: the loop above already covered all of them
+ return sasum;
+ }
+ }
+ int mp1 = restm + 1;
+ index = mp1;
+
+ for (int i = (n - mp1 + unrollSize) / unrollSize; i > 0; --i) { // n / 6 groups of six
+ sasum = sasum + Math.abs(x[index - 1 + xOffset]) + Math.abs(x[index + xOffset])
+ + Math.abs(x[index + 1 + xOffset]) + Math.abs(x[index + 2 + xOffset])
+ + Math.abs(x[index + 3 + xOffset]) + Math.abs(x[index + 4 + xOffset]);
+ index += unrollSize;
+ }
+ return sasum;
+ } else {
+ long nIncx = (long) n * incx; // product formed in long so it cannot wrap around
+ index = 1;
+ for (long i = (nIncx - 1 + incx) / incx; i > 0; --i) { // trip count works out to n
+ sasum += Math.abs(x[index - 1 + xOffset]);
+ index += incx;
+ }
+ return sasum;
+ }
+ }
+ }
+
+ @Override // Short form of daxpy: calls org.netlib.blas.Daxpy with offset 0 for both x and y.
+ public void daxpy(int n, double alpha, double[] x, int incx, double[] y, int incy) {
+ org.netlib.blas.Daxpy.daxpy(n, alpha, x, 0, incx, y, 0, incy); // NOTE(review): presumably y += alpha * x - confirm in the library
+ }
+
+ @Override // Long form of daxpy: passes every argument unchanged to org.netlib.blas.Daxpy.
+ public void daxpy(int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) {
+ org.netlib.blas.Daxpy.daxpy(n, alpha, x, xOffset, incx, y, yOffset, incy);
+ }
+
+ @Override // Short form of saxpy: calls org.netlib.blas.Saxpy with offset 0 for both x and y.
+ public void saxpy(int n, float alpha, float[] x, int incx, float[] y, int incy) {
+ org.netlib.blas.Saxpy.saxpy(n, alpha, x, 0, incx, y, 0, incy); // NOTE(review): presumably y += alpha * x - confirm in the library
+ }
+
+ @Override // Long form of saxpy: passes every argument unchanged to org.netlib.blas.Saxpy.
+ public void saxpy(int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
+ org.netlib.blas.Saxpy.saxpy(n, alpha, x, xOffset, incx, y, yOffset, incy);
+ }
+
+ @Override // Short form of dcopy: calls org.netlib.blas.Dcopy with offset 0 for both x and y.
+ public void dcopy(int n, double[] x, int incx, double[] y, int incy) {
+ org.netlib.blas.Dcopy.dcopy(n, x, 0, incx, y, 0, incy); // NOTE(review): presumably copies x into y - confirm in the library
+ }
+
+ @Override // Long form of dcopy: passes every argument unchanged to org.netlib.blas.Dcopy.
+ public void dcopy(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) {
+ org.netlib.blas.Dcopy.dcopy(n, x, xOffset, incx, y, yOffset, incy);
+ }
+
+ @Override // Short form of scopy: calls org.netlib.blas.Scopy with offset 0 for both x and y.
+ public void scopy(int n, float[] x, int incx, float[] y, int incy) {
+ org.netlib.blas.Scopy.scopy(n, x, 0, incx, y, 0, incy); // NOTE(review): presumably copies x into y - confirm in the library
+ }
+
+ @Override // Long form of scopy: passes every argument unchanged to org.netlib.blas.Scopy.
+ public void scopy(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
+ org.netlib.blas.Scopy.scopy(n, x, xOffset, incx, y, yOffset, incy);
+ }
+
+ @Override // Short form of ddot: returns the result of org.netlib.blas.Ddot called with offset 0 for x and y.
+ public double ddot(int n, double[] x, int incx, double[] y, int incy) {
+ return org.netlib.blas.Ddot.ddot(n, x, 0, incx, y, 0, incy); // NOTE(review): presumably the dot product of x and y - confirm in the library
+ }
+
+ @Override // Long form of ddot: passes every argument unchanged to org.netlib.blas.Ddot and returns its result.
+ public double ddot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) {
+ return org.netlib.blas.Ddot.ddot(n, x, xOffset, incx, y, yOffset, incy);
+ }
+
+ @Override // Short form of sdot: returns the result of org.netlib.blas.Sdot called with offset 0 for x and y.
+ public float sdot(int n, float[] x, int incx, float[] y, int incy) {
+ return org.netlib.blas.Sdot.sdot(n, x, 0, incx, y, 0, incy); // NOTE(review): presumably the dot product of x and y - confirm in the library
+ }
+
+ @Override // Long form of sdot: passes every argument unchanged to org.netlib.blas.Sdot and returns its result.
+ public float sdot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
+ return org.netlib.blas.Sdot.sdot(n, x, xOffset, incx, y, yOffset, incy);
+ }
+
+ @Override // dnrm2 without offset: same result as the offset overload called with xOffset = 0.
+ public double dnrm2(int n, double[] x, int incx) {
+ return dnrm2(n, x, 0, incx);
+ }
+
+ /**
+ * Returns the Euclidean norm (square root of the sum of squares) of n elements of x, reading
+ * x[xOffset] first and then every incx-th element after it.
+ * <p>
+ * The result is built as scale * sqrt(ssq): scale tracks the largest absolute value seen so far
+ * and ssq the sum of squares of the elements divided by scale, so an element is only squared
+ * after that division (presumably to keep the squares from overflowing or underflowing).
+ * <p>
+ * F2jblas dnrm2 fixed version, use long to store (n * incx) to avoid int overflow.
+ *
+ * @param n number of elements to include
+ * @param x array holding the elements
+ * @param xOffset index in x of the first element
+ * @param incx distance in x between consecutive elements
+ * @return the norm; 0.0 when n < 1 or incx < 1, and |x[xOffset]| when n == 1
+ */
+ @Override
+ public double dnrm2(int n, double[] x, int xOffset, int incx) {
+ double absxi = 0.0D;
+ double norm = 0.0D;
+ double scale = 0.0D;
+ double ssq = 0.0D;
+ if (n < 1 || incx < 1) {
+ norm = 0.0;
+ } else if (n == 1) {
+ norm = Math.abs(x[xOffset]);
+ } else {
+ scale = 0.0;
+ ssq = 1.0D;
+ int ix = 1; // 1-based position, hence the "- 1" whenever it is used to address x
+ for (long i = ((long) n * incx) / incx; i > 0; --i) { // trip count is n; the product is formed in long
+ if (x[ix - 1 + xOffset] != 0.0) { // zero elements leave scale and ssq untouched
+ absxi = Math.abs(x[ix - 1 + xOffset]);
+ if (scale < absxi) {
+ ssq = 1.0D + ssq * Math.pow(scale / absxi, (double) 2); // new maximum: re-express ssq relative to it
+ scale = absxi;
+ } else {
+ ssq += Math.pow(absxi / scale, (double) 2);
+ }
+ }
+ ix += incx;
+ }
+ norm = scale * Math.sqrt(ssq);
+ }
+ return norm;
+ }
+
+ @Override // snrm2 without offset: same result as the offset overload called with xOffset = 0.
+ public float snrm2(int n, float[] x, int incx) {
+ return snrm2(n, x, 0, incx);
+ }
+
+ /**
+ * Returns the Euclidean norm (square root of the sum of squares) of n elements of x, reading
+ * x[xOffset] first and then every incx-th element after it.
+ * <p>
+ * The result is built as scale * sqrt(ssq): scale tracks the largest absolute value seen so far
+ * and ssq the sum of squares of the elements divided by scale, so an element is only squared
+ * after that division (presumably to keep the squares from overflowing or underflowing).
+ * <p>
+ * F2jblas snrm2 fixed version, use long to store (n * incx) to avoid int overflow.
+ *
+ * @param n number of elements to include
+ * @param x array holding the elements
+ * @param xOffset index in x of the first element
+ * @param incx distance in x between consecutive elements
+ * @return the norm; 0.0F when n < 1 or incx < 1, and |x[xOffset]| when n == 1
+ */
+ @Override
+ public float snrm2(int n, float[] x, int xOffset, int incx) {
+ float absxi = 0.0F;
+ float norm = 0.0F;
+ float scale = 0.0F;
+ float ssq = 0.0F;
+ if (n < 1 || incx < 1) {
+ norm = 0.0F;
+ } else if (n == 1) {
+ norm = Math.abs(x[xOffset]);
+ } else {
+ scale = 0.0F;
+ ssq = 1.0F;
+ int ix = 1; // 1-based position, hence the "- 1" whenever it is used to address x
+ for (long i = ((long) n * incx) / incx; i > 0; --i) { // trip count is n; the product is formed in long
+ if (x[ix - 1 + xOffset] != 0.0F) { // zero elements leave scale and ssq untouched
+ absxi = Math.abs(x[ix - 1 + xOffset]);
+ if (scale < absxi) {
+ ssq = 1.0F + ssq * (float) Math.pow(scale / absxi, 2); // new maximum: re-express ssq relative to it
+ scale = absxi;
+ } else {
+ ssq += Math.pow(absxi / scale, 2); // "+=" adds in double, then narrows the sum back to float
+ }
+ }
+ ix += incx;
+ }
+ norm = scale * (float) Math.sqrt(ssq);
+ }
+ return norm;
+ }
+
+ @Override // Short form of srot: calls org.netlib.blas.Srot with offset 0 for both x and y.
+ public void srot(int n, float[] x, int incx, float[] y, int incy, float c, float s) {
+ org.netlib.blas.Srot.srot(n, x, 0, incx, y, 0, incy, c, s); // NOTE(review): presumably a plane rotation of (x, y) by c and s - confirm in the library
+ }
+
+ @Override // Long form of srot: passes every argument unchanged to org.netlib.blas.Srot.
+ public void srot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, float c, float s) {
+ org.netlib.blas.Srot.srot(n, x, xOffset, incx, y, yOffset, incy, c, s);
+ }
+
+ @Override // Short form of drot: calls org.netlib.blas.Drot with offset 0 for both x and y.
+ public void drot(int n, double[] x, int incx, double[] y, int incy, double c, double s) {
+ org.netlib.blas.Drot.drot(n, x, 0, incx, y, 0, incy, c, s); // NOTE(review): presumably a plane rotation of (x, y) by c and s - confirm in the library
+ }
+
+ @Override // Long form of drot: passes every argument unchanged to org.netlib.blas.Drot.
+ public void drot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy, double c, double s) {
+ org.netlib.blas.Drot.drot(n, x, xOffset, incx, y, yOffset, incy, c, s);
+ }
+
+ @Override // srotm without offsets: same effect as the offset overload with xOffset, yOffset and paramOffset all 0.
+ public void srotm(int n, float[] x, int incx, float[] y, int incy, float[] param) {
+ srotm(n, x, 0, incx, y, 0, incy, param, 0);
+ }
+
+ /**
+ * Replaces each of n pairs (x_i, y_i) in place by
+ * (h11 * x_i + h12 * y_i, h21 * x_i + h22 * y_i), where the 2x2 coefficients depend on
+ * flag = param[paramOffset]:
+ * <ul>
+ * <li>flag == -2: nothing is modified;</li>
+ * <li>flag &lt; 0 (otherwise): h11, h21, h12, h22 are read from param[paramOffset + 1..4];</li>
+ * <li>flag == 0: h12 and h21 are read from param, h11 and h22 act as 1;</li>
+ * <li>any other flag: h11 and h22 are read from param, h12 acts as 1 and h21 as -1.</li>
+ * </ul>
+ * param[paramOffset] is read before n is checked, so it must exist even when n <= 0.
+ * <p>
+ * f2jblas srotm fixed version, use long to store (n * incx) to avoid int overflow
+ *
+ * @param n number of pairs to transform; nothing happens when n <= 0
+ * @param x first vector, updated in place
+ * @param xOffset index in x of the first element when incx is positive
+ * @param incx distance in x between consecutive elements; may be negative
+ * @param y second vector, updated in place
+ * @param yOffset index in y of the first element when incy is positive
+ * @param incy distance in y between consecutive elements; may be negative
+ * @param param flag followed by h11, h21, h12, h22
+ * @param paramOffset index in param of the flag
+ */
+ @Override
+ public void srotm(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, float[] param,
+ int paramOffset) {
+ float flag = 0.0F;
+ float h11 = 0.0F;
+ float h12 = 0.0F;
+ float h21 = 0.0F;
+ float h22 = 0.0F;
+ float wi = 0.0F;
+ float zi = 0.0F;
+ flag = param[paramOffset];
+ if (n > 0 && Float.compare(flag, -2.0F) != 0) { // If flag equals -2.0, do nothing and return directly.
+ int index;
+ if ((incx == incy && incx > 0) ^ true) { // "^ true" negates: taken when the increments differ or are not positive
+ int xIndex = 1; // 1-based positions, hence the "- 1" whenever they address x or y
+ int yIndex = 1;
+ if (incx < 0) {
+ xIndex = 1 + (1 - n) * incx; // negative step: start (n - 1) * |incx| positions in and walk back
+ }
+ if (incy < 0) {
+ yIndex = 1 + (1 - n) * incy;
+ }
+ if (flag < 0.0) {
+ h11 = param[2 - 1 + paramOffset];
+ h12 = param[4 - 1 + paramOffset];
+ h21 = param[3 - 1 + paramOffset];
+ h22 = param[5 - 1 + paramOffset];
+ index = 1; // only counted in this branch, never used to address x or y
+ for (int i = n; i > 0; --i) {
+ wi = x[xIndex - 1 + xOffset];
+ zi = y[yIndex - 1 + yOffset];
+ x[xIndex - 1 + xOffset] = wi * h11 + zi * h12;
+ y[yIndex - 1 + yOffset] = wi * h21 + zi * h22;
+ xIndex += incx;
+ yIndex += incy;
+ ++index;
+ }
+ } else if (flag == 0.0) {
+ h12 = param[4 - 1 + paramOffset];
+ h21 = param[3 - 1 + paramOffset];
+ index = 1; // only counted in this branch, never used to address x or y
+ for (int i = n; i > 0; --i) {
+ wi = x[xIndex - 1 + xOffset];
+ zi = y[yIndex - 1 + yOffset];
+ x[xIndex - 1 + xOffset] = wi + zi * h12;
+ y[yIndex - 1 + yOffset] = wi * h21 + zi;
+ xIndex += incx;
+ yIndex += incy;
+ ++index;
+ }
+ } else {
+ h11 = param[2 - 1 + paramOffset];
+ h22 = param[5 - 1 + paramOffset];
+ for (int i = n; i > 0; --i) {
+ wi = x[xIndex - 1 + xOffset];
+ zi = y[yIndex - 1 + yOffset];
+ x[xIndex - 1 + xOffset] = wi * h11 + zi;
+ y[yIndex - 1 + yOffset] = -wi + h22 * zi;
+ xIndex += incx;
+ yIndex += incy;
+ }
+ }
+ } else { // equal, positive increments: one shared 1-based index addresses both x and y
+ long nSteps = (long) n * incx; // product formed in long so it cannot wrap around
+ if (flag < 0.0) {
+ h11 = param[2 - 1 + paramOffset];
+ h12 = param[4 - 1 + paramOffset];
+ h21 = param[3 - 1 + paramOffset];
+ h22 = param[5 - 1 + paramOffset];
+ index = 1;
+ for (long i = (nSteps - 1 + incx) / incx; i > 0; --i) { // trip count works out to n
+ wi = x[index - 1 + xOffset];
+ zi = y[index - 1 + yOffset];
+ x[index - 1 + xOffset] = wi * h11 + zi * h12;
+ y[index - 1 + yOffset] = wi * h21 + zi * h22;
+ index += incx;
+ }
+ } else if (flag == 0.0) {
+ h12 = param[4 - 1 + paramOffset];
+ h21 = param[3 - 1 + paramOffset];
+ index = 1;
+ for (long i = (nSteps - 1 + incx) / incx; i > 0; --i) { // trip count works out to n
+ wi = x[index - 1 + xOffset];
+ zi = y[index - 1 + yOffset];
+ x[index - 1 + xOffset] = wi + zi * h12;
+ y[index - 1 + yOffset] = wi * h21 + zi;
+ index += incx;
+ }
+ } else {
+ h11 = param[2 - 1 + paramOffset];
+ h22 = param[5 - 1 + paramOffset];
+ index = 1;
+ for (long i = (nSteps - 1 + incx) / incx; i > 0; --i) { // trip count works out to n
+ wi = x[index - 1 + xOffset];
+ zi = y[index - 1 + yOffset];
+ x[index - 1 + xOffset] = wi * h11 + zi;
+ y[index - 1 + yOffset] = -wi + h22 * zi;
+ index += incx;
+ }
+ }
+ }
+ }
+ }
+
+ @Override // Convenience overload: runs the offset-aware drotm with the x, y and param offsets all set to 0.
+ public void drotm(int n, double[] x, int incx, double[] y, int incy, double[] param) {
+ drotm(n, x, 0, incx, y, 0, incy, param, 0);
+ }
+
+ /**
+ * F2jblas drotm fixed version: (n * incx) is held in a long so the loop trip count cannot overflow int.
+ * Every pair (x, y) is overwritten in place by H * (x, y), where the flag param[paramOffset] picks H's form.
+ *
+ * @param n number of element pairs to update; nothing is written when n is not positive
+ * @param x first vector, modified in place
+ * @param xOffset base index added to every access into x
+ * @param incx step between consecutive elements of x
+ * @param y second vector, modified in place
+ * @param yOffset base index added to every access into y
+ * @param incy step between consecutive elements of y
+ * @param param flag at paramOffset, followed by the entries h11, h21, h12, h22
+ * @param paramOffset index of the flag inside param
+ */
+ @Override
+ public void drotm(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy, double[] param,
+ int paramOffset) {
+ final double flag = param[paramOffset];
+ // An empty range or a flag of exactly -2.0 leaves both vectors untouched.
+ if (n <= 0 || Double.compare(flag, -2.0D) == 0) {
+ return;
+ }
+ final boolean sharedPositiveStride = incx == incy && incx > 0;
+ if (!sharedPositiveStride) {
+ // General strides: x and y are walked with independent zero-based cursors.
+ int xPos = incx < 0 ? (1 - n) * incx : 0;
+ int yPos = incy < 0 ? (1 - n) * incy : 0;
+ if (flag < 0.0) {
+ // All four matrix entries are taken from param.
+ final double h11 = param[paramOffset + 1];
+ final double h12 = param[paramOffset + 3];
+ final double h21 = param[paramOffset + 2];
+ final double h22 = param[paramOffset + 4];
+ for (int k = 0; k < n; k++) {
+ final double xv = x[xPos + xOffset];
+ final double yv = y[yPos + yOffset];
+ x[xPos + xOffset] = xv * h11 + yv * h12;
+ y[yPos + yOffset] = xv * h21 + yv * h22;
+ xPos += incx;
+ yPos += incy;
+ }
+ } else if (flag == 0.0) {
+ // Implicit unit diagonal: only h12 and h21 are read from param.
+ final double h12 = param[paramOffset + 3];
+ final double h21 = param[paramOffset + 2];
+ for (int k = 0; k < n; k++) {
+ final double xv = x[xPos + xOffset];
+ final double yv = y[yPos + yOffset];
+ x[xPos + xOffset] = xv + yv * h12;
+ y[yPos + yOffset] = xv * h21 + yv;
+ xPos += incx;
+ yPos += incy;
+ }
+ } else {
+ // Implicit off-diagonal (+1, -1): only h11 and h22 are read from param.
+ final double h11 = param[paramOffset + 1];
+ final double h22 = param[paramOffset + 4];
+ for (int k = 0; k < n; k++) {
+ final double xv = x[xPos + xOffset];
+ final double yv = y[yPos + yOffset];
+ x[xPos + xOffset] = xv * h11 + yv;
+ y[yPos + yOffset] = -xv + h22 * yv;
+ xPos += incx;
+ yPos += incy;
+ }
+ }
+ } else {
+ // Shared positive stride: a single cursor serves both arrays; the trip count is computed in long.
+ final long nSteps = (long) n * incx;
+ final long trips = (nSteps - 1 + incx) / incx;
+ int pos = 0;
+ if (flag < 0.0) {
+ final double h11 = param[paramOffset + 1];
+ final double h12 = param[paramOffset + 3];
+ final double h21 = param[paramOffset + 2];
+ final double h22 = param[paramOffset + 4];
+ for (long k = trips; k > 0; --k) {
+ final double xv = x[pos + xOffset];
+ final double yv = y[pos + yOffset];
+ x[pos + xOffset] = xv * h11 + yv * h12;
+ y[pos + yOffset] = xv * h21 + yv * h22;
+ pos += incx;
+ }
+ } else if (flag == 0.0) {
+ final double h12 = param[paramOffset + 3];
+ final double h21 = param[paramOffset + 2];
+ for (long k = trips; k > 0; --k) {
+ final double xv = x[pos + xOffset];
+ final double yv = y[pos + yOffset];
+ x[pos + xOffset] = xv + yv * h12;
+ y[pos + yOffset] = xv * h21 + yv;
+ pos += incx;
+ }
+ } else {
+ final double h11 = param[paramOffset + 1];
+ final double h22 = param[paramOffset + 4];
+ for (long k = trips; k > 0; --k) {
+ final double xv = x[pos + xOffset];
+ final double yv = y[pos + yOffset];
+ x[pos + xOffset] = xv * h11 + yv;
+ y[pos + yOffset] = -xv + h22 * yv;
+ pos += incx;
+ }
+ }
+ }
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Sscal.sscal with the x offset fixed at 0.
+ public void sscal(int n, float alp, float[] x, int incx) {
+ org.netlib.blas.Sscal.sscal(n, alp, x, 0, incx);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Sscal.sscal.
+ public void sscal(int n, float alp, float[] x, int xOffset, int incx) {
+ org.netlib.blas.Sscal.sscal(n, alp, x, xOffset, incx);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dscal.dscal with the x offset fixed at 0.
+ public void dscal(int n, double alp, double[] x, int incx) {
+ org.netlib.blas.Dscal.dscal(n, alp, x, 0, incx);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dscal.dscal.
+ public void dscal(int n, double alp, double[] x, int xOffset, int incx) {
+ org.netlib.blas.Dscal.dscal(n, alp, x, xOffset, incx);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Sswap.sswap with the x and y offsets fixed at 0.
+ public void sswap(int n, float[] x, int incx, float[] y, int incy) {
+ org.netlib.blas.Sswap.sswap(n, x, 0, incx, y, 0, incy);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Sswap.sswap.
+ public void sswap(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
+ org.netlib.blas.Sswap.sswap(n, x, xOffset, incx, y, yOffset, incy);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dswap.dswap with the x and y offsets fixed at 0.
+ public void dswap(int n, double[] x, int incx, double[] y, int incy) {
+ org.netlib.blas.Dswap.dswap(n, x, 0, incx, y, 0, incy);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dswap.dswap.
+ public void dswap(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) {
+ org.netlib.blas.Dswap.dswap(n, x, xOffset, incx, y, yOffset, incy);
+ }
+
+ @Override // Convenience overload: returns org.netlib.blas.Isamax.isamax evaluated with the x offset fixed at 0.
+ public int isamax(int n, float[] x, int incx) {
+ return org.netlib.blas.Isamax.isamax(n, x, 0, incx);
+ }
+
+ @Override // Returns the result of org.netlib.blas.Isamax.isamax, forwarding every argument unchanged.
+ public int isamax(int n, float[] x, int xOffset, int incx) {
+ return org.netlib.blas.Isamax.isamax(n, x, xOffset, incx);
+ }
+
+ @Override // Convenience overload: returns org.netlib.blas.Idamax.idamax evaluated with the x offset fixed at 0.
+ public int idamax(int n, double[] x, int incx) {
+ return org.netlib.blas.Idamax.idamax(n, x, 0, incx);
+ }
+
+ @Override // Returns the result of org.netlib.blas.Idamax.idamax, forwarding every argument unchanged.
+ public int idamax(int n, double[] x, int xOffset, int incx) {
+ return org.netlib.blas.Idamax.idamax(n, x, xOffset, incx);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dgbmv.dgbmv with the a, x and y offsets fixed at 0.
+ public void dgbmv(String trans, int m, int n, int kl, int ku, double alpha, double[] a, int lda,
+ double[] x, int incx, double beta, double[] y, int incy) {
+ org.netlib.blas.Dgbmv.dgbmv(trans, m, n, kl, ku, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dgbmv.dgbmv.
+ public void dgbmv(String trans, int m, int n, int kl, int ku, double alpha, double[] a, int aOffset, int lda,
+ double[] x, int xOffset, int incx, double beta, double[] y, int yOffset, int incy) {
+ org.netlib.blas.Dgbmv.dgbmv(
+ trans, m, n, kl, ku, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Sgbmv.sgbmv with the a, x and y offsets fixed at 0.
+ public void sgbmv(String trans, int m, int n, int kl, int ku, float alpha, float[] a, int lda, float[] x, int incx,
+ float beta, float[] y, int incy) {
+ org.netlib.blas.Sgbmv.sgbmv(trans, m, n, kl, ku, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Sgbmv.sgbmv.
+ public void sgbmv(String trans, int m, int n, int kl, int ku, float alpha, float[] a, int aOffset, int lda,
+ float[] x, int xOffset, int incx, float beta, float[] y, int yOffset, int incy) {
+ org.netlib.blas.Sgbmv.sgbmv(
+ trans, m, n, kl, ku, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dgemv.dgemv with the a, x and y offsets fixed at 0.
+ public void dgemv(String trans, int m, int n, double alpha, double[] a, int lda, double[] x, int incx, double beta,
+ double[] y, int incy) {
+ org.netlib.blas.Dgemv.dgemv(trans, m, n, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dgemv.dgemv.
+ public void dgemv(String trans, int m, int n, double alpha, double[] a, int aOffset, int lda, double[] x,
+ int xOffset, int incx, double beta, double[] y, int yOffset, int incy) {
+ org.netlib.blas.Dgemv.dgemv(trans, m, n, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Sgemv.sgemv with the a, x and y offsets fixed at 0.
+ public void sgemv(String trans, int m, int n, float alpha, float[] a, int lda, float[] x, int incx, float beta,
+ float[] y, int incy) {
+ org.netlib.blas.Sgemv.sgemv(trans, m, n, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Sgemv.sgemv.
+ public void sgemv(String trans, int m, int n, float alpha, float[] a, int aOffset, int lda, float[] x,
+ int xOffset, int incx, float beta, float[] y, int yOffset, int incy) {
+ org.netlib.blas.Sgemv.sgemv(trans, m, n, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dger.dger with the x, y and a offsets fixed at 0.
+ public void dger(int m, int n, double alpha, double[] x, int incx, double[] y, int incy, double[] a, int lda) {
+ org.netlib.blas.Dger.dger(m, n, alpha, x, 0, incx, y, 0, incy, a, 0, lda);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dger.dger.
+ public void dger(int m, int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy,
+ double[] a, int aOffset, int lda) {
+ org.netlib.blas.Dger.dger(m, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset, lda);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Sger.sger with the x, y and a offsets fixed at 0.
+ public void sger(int m, int n, float alpha, float[] x, int incx, float[] y, int incy, float[] a, int lda) {
+ org.netlib.blas.Sger.sger(m, n, alpha, x, 0, incx, y, 0, incy, a, 0, lda);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Sger.sger.
+ public void sger(int m, int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy,
+ float[] a, int aOffset, int lda) {
+ org.netlib.blas.Sger.sger(m, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset, lda);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dsbmv.dsbmv with the a, x and y offsets fixed at 0.
+ public void dsbmv(String uplo, int n, int k, double alpha, double[] a, int lda, double[] x, int incx, double beta,
+ double[] y, int incy) {
+ org.netlib.blas.Dsbmv.dsbmv(uplo, n, k, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dsbmv.dsbmv.
+ public void dsbmv(String uplo, int n, int k, double alpha, double[] a, int aOffset, int lda,
+ double[] x, int xOffset, int incx, double beta, double[] y, int yOffset, int incy) {
+ org.netlib.blas.Dsbmv.dsbmv(uplo, n, k, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Ssbmv.ssbmv with the a, x and y offsets fixed at 0.
+ public void ssbmv(String uplo, int n, int k, float alpha, float[] a, int lda, float[] x, int incx, float beta,
+ float[] y, int incy) {
+ org.netlib.blas.Ssbmv.ssbmv(uplo, n, k, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Ssbmv.ssbmv.
+ public void ssbmv(String uplo, int n, int k, float alpha, float[] a, int aOffset, int lda, float[] x,
+ int xOffset, int incx, float beta, float[] y, int yOffset, int incy) {
+ org.netlib.blas.Ssbmv.ssbmv(uplo, n, k, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dspmv.dspmv with the a, x and y offsets fixed at 0.
+ public void dspmv(String uplo, int n, double alpha, double[] a, double[] x, int incx, double beta,
+ double[] y, int incy) {
+ org.netlib.blas.Dspmv.dspmv(uplo, n, alpha, a, 0, x, 0, incx, beta, y, 0, incy);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dspmv.dspmv.
+ public void dspmv(String uplo, int n, double alpha, double[] a, int aOffset, double[] x, int xOffset, int incx,
+ double beta, double[] y, int yOffset, int incy) {
+ org.netlib.blas.Dspmv.dspmv(uplo, n, alpha, a, aOffset, x, xOffset, incx, beta, y, yOffset, incy);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Sspmv.sspmv with the a, x and y offsets fixed at 0.
+ public void sspmv(String uplo, int n, float alpha, float[] a, float[] x, int incx, float beta,
+ float[] y, int incy) {
+ org.netlib.blas.Sspmv.sspmv(uplo, n, alpha, a, 0, x, 0, incx, beta, y, 0, incy);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Sspmv.sspmv.
+ public void sspmv(String uplo, int n, float alpha, float[] a, int aOffset, float[] x, int xOffset, int incx,
+ float beta, float[] y, int yOffset, int incy) {
+ org.netlib.blas.Sspmv.sspmv(uplo, n, alpha, a, aOffset, x, xOffset, incx, beta, y, yOffset, incy);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dspr.dspr with the x and ap offsets fixed at 0.
+ public void dspr(String uplo, int n, double alpha, double[] x, int incx, double[] ap) {
+ org.netlib.blas.Dspr.dspr(uplo, n, alpha, x, 0, incx, ap, 0);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dspr.dspr (aOffset is the offset into ap).
+ public void dspr(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] ap, int aOffset) {
+ org.netlib.blas.Dspr.dspr(uplo, n, alpha, x, xOffset, incx, ap, aOffset);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Sspr.sspr with the x and ap offsets fixed at 0.
+ public void sspr(String uplo, int n, float alpha, float[] x, int incx, float[] ap) {
+ org.netlib.blas.Sspr.sspr(uplo, n, alpha, x, 0, incx, ap, 0);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Sspr.sspr (aOffset is the offset into ap).
+ public void sspr(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] ap, int aOffset) {
+ org.netlib.blas.Sspr.sspr(uplo, n, alpha, x, xOffset, incx, ap, aOffset);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dspr2.dspr2 with the x, y and a offsets fixed at 0.
+ public void dspr2(String uplo, int n, double alpha, double[] x, int incx, double[] y, int incy, double[] a) {
+ org.netlib.blas.Dspr2.dspr2(uplo, n, alpha, x, 0, incx, y, 0, incy, a, 0);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dspr2.dspr2.
+ public void dspr2(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset,
+ int incy, double[] a, int aOffset) {
+ org.netlib.blas.Dspr2.dspr2(uplo, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Sspr2.sspr2 with the x, y and a offsets fixed at 0.
+ public void sspr2(String uplo, int n, float alpha, float[] x, int incx, float[] y, int incy, float[] a) {
+ org.netlib.blas.Sspr2.sspr2(uplo, n, alpha, x, 0, incx, y, 0, incy, a, 0);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Sspr2.sspr2.
+ public void sspr2(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset,
+ int incy, float[] a, int aOffset) {
+ org.netlib.blas.Sspr2.sspr2(uplo, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dsymv.dsymv with the a, x and y offsets fixed at 0.
+ public void dsymv(String uplo, int n, double alpha, double[] a, int lda, double[] x, int incx, double beta,
+ double[] y, int incy) {
+ org.netlib.blas.Dsymv.dsymv(uplo, n, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dsymv.dsymv.
+ public void dsymv(String uplo, int n, double alpha, double[] a, int aOffset, int lda, double[] x, int xOffset,
+ int incx, double beta, double[] y, int yOffset, int incy) {
+ org.netlib.blas.Dsymv.dsymv(uplo, n, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Ssymv.ssymv with the a, x and y offsets fixed at 0.
+ public void ssymv(String uplo, int n, float alpha, float[] a, int lda, float[] x, int incx, float beta,
+ float[] y, int incy) {
+ org.netlib.blas.Ssymv.ssymv(uplo, n, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Ssymv.ssymv.
+ public void ssymv(String uplo, int n, float alpha, float[] a, int aOffset, int lda, float[] x, int xOffset,
+ int incx, float beta, float[] y, int yOffset, int incy) {
+ org.netlib.blas.Ssymv.ssymv(uplo, n, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dsyr.dsyr with the x and a offsets fixed at 0.
+ public void dsyr(String uplo, int n, double alpha, double[] x, int incx, double[] a, int lda) {
+ org.netlib.blas.Dsyr.dsyr(uplo, n, alpha, x, 0, incx, a, 0, lda);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dsyr.dsyr.
+ public void dsyr(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] a, int aOffset,
+ int lda) {
+ org.netlib.blas.Dsyr.dsyr(uplo, n, alpha, x, xOffset, incx, a, aOffset, lda);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Ssyr.ssyr with the x and a offsets fixed at 0.
+ public void ssyr(String uplo, int n, float alpha, float[] x, int incx, float[] a, int lda) {
+ org.netlib.blas.Ssyr.ssyr(uplo, n, alpha, x, 0, incx, a, 0, lda);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Ssyr.ssyr.
+ public void ssyr(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] a, int aOffset,
+ int lda) {
+ org.netlib.blas.Ssyr.ssyr(uplo, n, alpha, x, xOffset, incx, a, aOffset, lda);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dsyr2.dsyr2 with the x, y and a offsets fixed at 0.
+ public void dsyr2(String uplo, int n, double alpha, double[] x, int incx, double[] y, int incy,
+ double[] a, int lda) {
+ org.netlib.blas.Dsyr2.dsyr2(uplo, n, alpha, x, 0, incx, y, 0, incy, a, 0, lda);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dsyr2.dsyr2.
+ public void dsyr2(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset,
+ int incy, double[] a, int aOffset, int lda) {
+ org.netlib.blas.Dsyr2.dsyr2(uplo, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset, lda);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Ssyr2.ssyr2 with the x, y and a offsets fixed at 0.
+ public void ssyr2(String uplo, int n, float alpha, float[] x, int incx, float[] y, int incy, float[] a, int lda) {
+ org.netlib.blas.Ssyr2.ssyr2(uplo, n, alpha, x, 0, incx, y, 0, incy, a, 0, lda);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Ssyr2.ssyr2.
+ public void ssyr2(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset,
+ int incy, float[] a, int aOffset, int lda) {
+ org.netlib.blas.Ssyr2.ssyr2(uplo, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset, lda);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dtbmv.dtbmv with the a and x offsets fixed at 0.
+ public void dtbmv(String uplo, String trans, String diag, int n, int k, double[] a, int lda, double[] x, int incx) {
+ org.netlib.blas.Dtbmv.dtbmv(uplo, trans, diag, n, k, a, 0, lda, x, 0, incx);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dtbmv.dtbmv.
+ public void dtbmv(String uplo, String trans, String diag, int n, int k, double[] a, int aOffset, int lda,
+ double[] x, int xOffset, int incx) {
+ org.netlib.blas.Dtbmv.dtbmv(uplo, trans, diag, n, k, a, aOffset, lda, x, xOffset, incx);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Stbmv.stbmv with the a and x offsets fixed at 0.
+ public void stbmv(String uplo, String trans, String diag, int n, int k, float[] a, int lda, float[] x, int incx) {
+ org.netlib.blas.Stbmv.stbmv(uplo, trans, diag, n, k, a, 0, lda, x, 0, incx);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Stbmv.stbmv.
+ public void stbmv(String uplo, String trans, String diag, int n, int k, float[] a, int aOffset, int lda,
+ float[] x, int xOffset, int incx) {
+ org.netlib.blas.Stbmv.stbmv(uplo, trans, diag, n, k, a, aOffset, lda, x, xOffset, incx);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dtbsv.dtbsv with the a and x offsets fixed at 0.
+ public void dtbsv(String uplo, String trans, String diag, int n, int k, double[] a, int lda, double[] x, int incx) {
+ org.netlib.blas.Dtbsv.dtbsv(uplo, trans, diag, n, k, a, 0, lda, x, 0, incx);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dtbsv.dtbsv.
+ public void dtbsv(String uplo, String trans, String diag, int n, int k, double[] a, int aOffset, int lda,
+ double[] x, int xOffset, int incx) {
+ org.netlib.blas.Dtbsv.dtbsv(uplo, trans, diag, n, k, a, aOffset, lda, x, xOffset, incx);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Stbsv.stbsv with the a and x offsets fixed at 0.
+ public void stbsv(String uplo, String trans, String diag, int n, int k, float[] a, int lda, float[] x, int incx) {
+ org.netlib.blas.Stbsv.stbsv(uplo, trans, diag, n, k, a, 0, lda, x, 0, incx);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Stbsv.stbsv.
+ public void stbsv(String uplo, String trans, String diag, int n, int k, float[] a, int aOffset, int lda,
+ float[] x, int xOffset, int incx) {
+ org.netlib.blas.Stbsv.stbsv(uplo, trans, diag, n, k, a, aOffset, lda, x, xOffset, incx);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dtpmv.dtpmv with the a and x offsets fixed at 0.
+ public void dtpmv(String uplo, String transa, String diag, int n, double[] a, double[] x, int incx) {
+ org.netlib.blas.Dtpmv.dtpmv(uplo, transa, diag, n, a, 0, x, 0, incx);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dtpmv.dtpmv.
+ public void dtpmv(String uplo, String transa, String diag, int n, double[] a, int aOffset, double[] x,
+ int xOffset, int incx) {
+ org.netlib.blas.Dtpmv.dtpmv(uplo, transa, diag, n, a, aOffset, x, xOffset, incx);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Stpmv.stpmv with the a and x offsets fixed at 0.
+ public void stpmv(String uplo, String transa, String diag, int n, float[] a, float[] x, int incx) {
+ org.netlib.blas.Stpmv.stpmv(uplo, transa, diag, n, a, 0, x, 0, incx);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Stpmv.stpmv.
+ public void stpmv(String uplo, String transa, String diag, int n, float[] a, int aOffset, float[] x,
+ int xOffset, int incx) {
+ org.netlib.blas.Stpmv.stpmv(uplo, transa, diag, n, a, aOffset, x, xOffset, incx);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dtpsv.dtpsv with the a and x offsets fixed at 0.
+ public void dtpsv(String uplo, String transa, String diag, int n, double[] a, double[] x, int incx) {
+ org.netlib.blas.Dtpsv.dtpsv(uplo, transa, diag, n, a, 0, x, 0, incx);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dtpsv.dtpsv.
+ public void dtpsv(String uplo, String transa, String diag, int n, double[] a, int aOffset, double[] x,
+ int xOffset, int incx) {
+ org.netlib.blas.Dtpsv.dtpsv(uplo, transa, diag, n, a, aOffset, x, xOffset, incx);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Stpsv.stpsv with the a and x offsets fixed at 0.
+ public void stpsv(String uplo, String transa, String diag, int n, float[] a, float[] x, int incx) {
+ org.netlib.blas.Stpsv.stpsv(uplo, transa, diag, n, a, 0, x, 0, incx);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Stpsv.stpsv.
+ public void stpsv(String uplo, String transa, String diag, int n, float[] a, int aOffset, float[] x,
+ int xOffset, int incx) {
+ org.netlib.blas.Stpsv.stpsv(uplo, transa, diag, n, a, aOffset, x, xOffset, incx);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dtrmv.dtrmv with the a and x offsets fixed at 0.
+ public void dtrmv(String uplo, String trans, String diag, int n, double[] a, int lda, double[] x, int incx) {
+ org.netlib.blas.Dtrmv.dtrmv(uplo, trans, diag, n, a, 0, lda, x, 0, incx);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dtrmv.dtrmv.
+ public void dtrmv(String uplo, String trans, String diag, int n, double[] a, int aOffset, int lda,
+ double[] x, int xOffset, int incx) {
+ org.netlib.blas.Dtrmv.dtrmv(uplo, trans, diag, n, a, aOffset, lda, x, xOffset, incx);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Strmv.strmv with the a and x offsets fixed at 0.
+ public void strmv(String uplo, String trans, String diag, int n, float[] a, int lda, float[] x, int incx) {
+ org.netlib.blas.Strmv.strmv(uplo, trans, diag, n, a, 0, lda, x, 0, incx);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Strmv.strmv.
+ public void strmv(String uplo, String trans, String diag, int n, float[] a, int aOffset, int lda,
+ float[] x, int xOffset, int incx) {
+ org.netlib.blas.Strmv.strmv(uplo, trans, diag, n, a, aOffset, lda, x, xOffset, incx);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dtrsv.dtrsv with the a and x offsets fixed at 0.
+ public void dtrsv(String uplo, String transa, String diag, int n, double[] a, int lda, double[] x, int incx) {
+ org.netlib.blas.Dtrsv.dtrsv(uplo, transa, diag, n, a, 0, lda, x, 0, incx);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dtrsv.dtrsv.
+ public void dtrsv(String uplo, String transa, String diag, int n, double[] a, int aOffset, int lda,
+ double[] x, int xOffset, int incx) {
+ org.netlib.blas.Dtrsv.dtrsv(uplo, transa, diag, n, a, aOffset, lda, x, xOffset, incx);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Strsv.strsv with the a and x offsets fixed at 0.
+ public void strsv(String uplo, String transa, String diag, int n, float[] a, int lda, float[] x, int incx) {
+ org.netlib.blas.Strsv.strsv(uplo, transa, diag, n, a, 0, lda, x, 0, incx);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Strsv.strsv.
+ public void strsv(String uplo, String transa, String diag, int n, float[] a, int aOffset, int lda,
+ float[] x, int xOffset, int incx) {
+ org.netlib.blas.Strsv.strsv(uplo, transa, diag, n, a, aOffset, lda, x, xOffset, incx);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dgemm.dgemm with the a, b and c offsets fixed at 0.
+ public void dgemm(String transa, String transb, int m, int n, int k, double alpha, double[] a, int lda,
+ double[] b, int ldb, double beta, double[] c, int ldc) {
+ org.netlib.blas.Dgemm.dgemm(transa, transb, m, n, k, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dgemm.dgemm.
+ public void dgemm(String transa, String transb, int m, int n, int k, double alpha, double[] a, int aOffset,
+ int lda, double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc) {
+ org.netlib.blas.Dgemm.dgemm(
+ transa, transb, m, n, k, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Sgemm.sgemm with the a, b and c offsets fixed at 0.
+ public void sgemm(String transa, String transb, int m, int n, int k, float alpha, float[] a, int lda,
+ float[] b, int ldb, float beta, float[] c, int ldc) {
+ org.netlib.blas.Sgemm.sgemm(transa, transb, m, n, k, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Sgemm.sgemm.
+ public void sgemm(String transa, String transb, int m, int n, int k, float alpha, float[] a, int aOffset,
+ int lda, float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc) {
+ org.netlib.blas.Sgemm.sgemm(
+ transa, transb, m, n, k, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dsymm.dsymm with the a, b and c offsets fixed at 0.
+ public void dsymm(String side, String uplo, int m, int n, double alpha, double[] a, int lda,
+ double[] b, int ldb, double beta, double[] c, int ldc) {
+ org.netlib.blas.Dsymm.dsymm(side, uplo, m, n, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dsymm.dsymm.
+ public void dsymm(String side, String uplo, int m, int n, double alpha, double[] a, int aOffset, int lda,
+ double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc) {
+ org.netlib.blas.Dsymm.dsymm(side, uplo, m, n, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Ssymm.ssymm with the a, b and c offsets fixed at 0.
+ public void ssymm(String side, String uplo, int m, int n, float alpha, float[] a, int lda,
+ float[] b, int ldb, float beta, float[] c, int ldc) {
+ org.netlib.blas.Ssymm.ssymm(side, uplo, m, n, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Ssymm.ssymm.
+ public void ssymm(String side, String uplo, int m, int n, float alpha, float[] a, int aOffset, int lda,
+ float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc) {
+ org.netlib.blas.Ssymm.ssymm(side, uplo, m, n, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dsyr2k.dsyr2k with the a, b and c offsets fixed at 0.
+ public void dsyr2k(String uplo, String trans, int n, int k, double alpha, double[] a, int lda,
+ double[] b, int ldb, double beta, double[] c, int ldc) {
+ org.netlib.blas.Dsyr2k.dsyr2k(uplo, trans, n, k, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dsyr2k.dsyr2k.
+ public void dsyr2k(String uplo, String trans, int n, int k, double alpha, double[] a, int aOffset, int lda,
+ double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc) {
+ org.netlib.blas.Dsyr2k.dsyr2k(
+ uplo, trans, n, k, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Ssyr2k.ssyr2k with the a, b and c offsets fixed at 0.
+ public void ssyr2k(String uplo, String trans, int n, int k, float alpha, float[] a, int lda,
+ float[] b, int ldb, float beta, float[] c, int ldc) {
+ org.netlib.blas.Ssyr2k.ssyr2k(uplo, trans, n, k, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Ssyr2k.ssyr2k.
+ public void ssyr2k(String uplo, String trans, int n, int k, float alpha, float[] a, int aOffset, int lda,
+ float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc) {
+ org.netlib.blas.Ssyr2k.ssyr2k(
+ uplo, trans, n, k, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dsyrk.dsyrk with the a and c offsets fixed at 0.
+ public void dsyrk(String uplo, String trans, int n, int k, double alpha, double[] a, int lda, double beta,
+ double[] c, int ldc) {
+ org.netlib.blas.Dsyrk.dsyrk(uplo, trans, n, k, alpha, a, 0, lda, beta, c, 0, ldc);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dsyrk.dsyrk.
+ public void dsyrk(String uplo, String trans, int n, int k, double alpha, double[] a, int aOffset, int lda,
+ double beta, double[] c, int cOffset, int ldc) {
+ org.netlib.blas.Dsyrk.dsyrk(uplo, trans, n, k, alpha, a, aOffset, lda, beta, c, cOffset, ldc);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Ssyrk.ssyrk with the a and c offsets fixed at 0.
+ public void ssyrk(String uplo, String trans, int n, int k, float alpha, float[] a, int lda, float beta,
+ float[] c, int ldc) {
+ org.netlib.blas.Ssyrk.ssyrk(uplo, trans, n, k, alpha, a, 0, lda, beta, c, 0, ldc);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Ssyrk.ssyrk.
+ public void ssyrk(String uplo, String trans, int n, int k, float alpha, float[] a, int aOffset, int lda,
+ float beta, float[] c, int cOffset, int ldc) {
+ org.netlib.blas.Ssyrk.ssyrk(uplo, trans, n, k, alpha, a, aOffset, lda, beta, c, cOffset, ldc);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dtrmm.dtrmm with the a and b offsets fixed at 0.
+ public void dtrmm(String side, String uplo, String transa, String diag, int m, int n, double alpha,
+ double[] a, int lda, double[] b, int ldb) {
+ org.netlib.blas.Dtrmm.dtrmm(side, uplo, transa, diag, m, n, alpha, a, 0, lda, b, 0, ldb);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dtrmm.dtrmm.
+ public void dtrmm(String side, String uplo, String transa, String diag, int m, int n, double alpha, double[] a,
+ int aOffset, int lda, double[] b, int bOffset, int ldb) {
+ org.netlib.blas.Dtrmm.dtrmm(side, uplo, transa, diag, m, n, alpha, a, aOffset, lda, b, bOffset, ldb);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Strmm.strmm with the a and b offsets fixed at 0.
+ public void strmm(String side, String uplo, String transa, String diag, int m, int n, float alpha,
+ float[] a, int lda, float[] b, int ldb) {
+ org.netlib.blas.Strmm.strmm(side, uplo, transa, diag, m, n, alpha, a, 0, lda, b, 0, ldb);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Strmm.strmm.
+ public void strmm(String side, String uplo, String transa, String diag, int m, int n, float alpha, float[] a,
+ int aOffset, int lda, float[] b, int bOffset, int ldb) {
+ org.netlib.blas.Strmm.strmm(side, uplo, transa, diag, m, n, alpha, a, aOffset, lda, b, bOffset, ldb);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Dtrsm.dtrsm with the a and b offsets fixed at 0.
+ public void dtrsm(String side, String uplo, String transa, String diag, int m, int n, double alpha,
+ double[] a, int lda, double[] b, int ldb) {
+ org.netlib.blas.Dtrsm.dtrsm(side, uplo, transa, diag, m, n, alpha, a, 0, lda, b, 0, ldb);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Dtrsm.dtrsm.
+ public void dtrsm(String side, String uplo, String transa, String diag, int m, int n, double alpha, double[] a,
+ int aOffset, int lda, double[] b, int bOffset, int ldb) {
+ org.netlib.blas.Dtrsm.dtrsm(side, uplo, transa, diag, m, n, alpha, a, aOffset, lda, b, bOffset, ldb);
+ }
+
+ @Override // Convenience overload: calls org.netlib.blas.Strsm.strsm with the a and b offsets fixed at 0.
+ public void strsm(String side, String uplo, String transa, String diag, int m, int n, float alpha,
+ float[] a, int lda, float[] b, int ldb) {
+ org.netlib.blas.Strsm.strsm(side, uplo, transa, diag, m, n, alpha, a, 0, lda, b, 0, ldb);
+ }
+
+ @Override // Forwards every argument unchanged to org.netlib.blas.Strsm.strsm.
+ public void strsm(String side, String uplo, String transa, String diag, int m, int n, float alpha, float[] a,
+ int aOffset, int lda, float[] b, int bOffset, int ldb) {
+ org.netlib.blas.Strsm.strsm(side, uplo, transa, diag, m, n, alpha, a, aOffset, lda, b, bOffset, ldb);
+ }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/VectorBLAS.java b/vectorBlas/src/main/java/com/huawei/vectorblas/VectorBLAS.java
new file mode 100644
index 0000000000000000000000000000000000000000..6b82957276919d60be9ff0f87b9de07696d05010
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/VectorBLAS.java
@@ -0,0 +1,424 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas;
+
+import com.huawei.vectorblas.blas1.doubleprecision.Dasum;
+import com.huawei.vectorblas.blas1.doubleprecision.Daxpy;
+import com.huawei.vectorblas.blas1.doubleprecision.Dcopy;
+import com.huawei.vectorblas.blas1.doubleprecision.Ddot;
+import com.huawei.vectorblas.blas1.doubleprecision.Dnrm2;
+import com.huawei.vectorblas.blas1.doubleprecision.Drot;
+import com.huawei.vectorblas.blas1.doubleprecision.Drotm;
+import com.huawei.vectorblas.blas1.doubleprecision.Dscal;
+import com.huawei.vectorblas.blas1.doubleprecision.Dswap;
+import com.huawei.vectorblas.blas1.doubleprecision.Idamax;
+import com.huawei.vectorblas.blas1.singleprecision.Isamax;
+import com.huawei.vectorblas.blas1.singleprecision.Sasum;
+import com.huawei.vectorblas.blas1.singleprecision.Saxpy;
+import com.huawei.vectorblas.blas1.singleprecision.Scopy;
+import com.huawei.vectorblas.blas1.singleprecision.Sdot;
+import com.huawei.vectorblas.blas1.singleprecision.Snrm2;
+import com.huawei.vectorblas.blas1.singleprecision.Srot;
+import com.huawei.vectorblas.blas1.singleprecision.Srotm;
+import com.huawei.vectorblas.blas1.singleprecision.Sscal;
+import com.huawei.vectorblas.blas1.singleprecision.Sswap;
+import com.huawei.vectorblas.blas2.doubleprecision.Dgemv;
+import com.huawei.vectorblas.blas2.doubleprecision.Dger;
+import com.huawei.vectorblas.blas2.doubleprecision.Dspmv;
+import com.huawei.vectorblas.blas2.doubleprecision.Dspr;
+import com.huawei.vectorblas.blas2.doubleprecision.Dsymv;
+import com.huawei.vectorblas.blas2.singleprecision.Sgemv;
+import com.huawei.vectorblas.blas2.singleprecision.Sger;
+import com.huawei.vectorblas.blas2.singleprecision.Sspmv;
+import com.huawei.vectorblas.blas2.singleprecision.Sspr;
+import com.huawei.vectorblas.blas2.singleprecision.Ssymv;
+import com.huawei.vectorblas.blas3.doubleprecision.Dgemm;
+import com.huawei.vectorblas.blas3.doubleprecision.Dsymm;
+import com.huawei.vectorblas.blas3.singleprecision.Sgemm;
+import com.huawei.vectorblas.blas3.singleprecision.Ssymm;
+
+public class VectorBLAS extends F2jBLAS { // Every override forwards to a static method of the same-named imported class.
+ @Override // dasum -> Dasum.dasum; this overload passes xOffset = 0.
+ public double dasum(int n, double[] x, int incx) {
+ return Dasum.dasum(n, x, 0, incx);
+ }
+
+ @Override // Long forms such as this one forward the caller's offsets unchanged.
+ public double dasum(int n, double[] x, int xOffset, int incx) {
+ return Dasum.dasum(n, x, xOffset, incx);
+ }
+
+ @Override // sasum -> Sasum.sasum; this overload passes xOffset = 0.
+ public float sasum(int n, float[] x, int incx) {
+ return Sasum.sasum(n, x, 0, incx);
+ }
+
+ @Override
+ public float sasum(int n, float[] x, int xOffset, int incx) {
+ return Sasum.sasum(n, x, xOffset, incx);
+ }
+
+ @Override // daxpy -> Daxpy.daxpy; this overload passes xOffset = yOffset = 0.
+ public void daxpy(int n, double alpha, double[] x, int incx, double[] y, int incy) {
+ Daxpy.daxpy(n, alpha, x, 0, incx, y, 0, incy);
+ }
+
+ @Override
+ public void daxpy(int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) {
+ Daxpy.daxpy(n, alpha, x, xOffset, incx, y, yOffset, incy);
+ }
+
+ @Override // saxpy -> Saxpy.saxpy; this overload passes xOffset = yOffset = 0.
+ public void saxpy(int n, float alpha, float[] x, int incx, float[] y, int incy) {
+ Saxpy.saxpy(n, alpha, x, 0, incx, y, 0, incy);
+ }
+
+ @Override
+ public void saxpy(int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
+ Saxpy.saxpy(n, alpha, x, xOffset, incx, y, yOffset, incy);
+ }
+
+ @Override // dcopy -> Dcopy.dcopy; this overload passes xOffset = yOffset = 0.
+ public void dcopy(int n, double[] x, int incx, double[] y, int incy) {
+ Dcopy.dcopy(n, x, 0, incx, y, 0, incy);
+ }
+
+ @Override
+ public void dcopy(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) {
+ Dcopy.dcopy(n, x, xOffset, incx, y, yOffset, incy);
+ }
+
+ @Override // scopy -> Scopy.scopy; this overload passes xOffset = yOffset = 0.
+ public void scopy(int n, float[] x, int incx, float[] y, int incy) {
+ Scopy.scopy(n, x, 0, incx, y, 0, incy);
+ }
+
+ @Override
+ public void scopy(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
+ Scopy.scopy(n, x, xOffset, incx, y, yOffset, incy);
+ }
+
+ @Override // ddot -> Ddot.ddot; this overload passes xOffset = yOffset = 0.
+ public double ddot(int n, double[] x, int incx, double[] y, int incy) {
+ return Ddot.ddot(n, x, 0, incx, y, 0, incy);
+ }
+
+ @Override
+ public double ddot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) {
+ return Ddot.ddot(n, x, xOffset, incx, y, yOffset, incy);
+ }
+
+ @Override // sdot -> Sdot.sdot; this overload passes xOffset = yOffset = 0.
+ public float sdot(int n, float[] x, int incx, float[] y, int incy) {
+ return Sdot.sdot(n, x, 0, incx, y, 0, incy);
+ }
+
+ @Override
+ public float sdot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
+ return Sdot.sdot(n, x, xOffset, incx, y, yOffset, incy);
+ }
+
+ @Override // snrm2 -> Snrm2.snrm2; this overload passes xOffset = 0.
+ public float snrm2(int n, float[] x, int incx) {
+ return Snrm2.snrm2(n, x, 0, incx);
+ }
+
+ @Override
+ public float snrm2(int n, float[] x, int xOffset, int incx) {
+ return Snrm2.snrm2(n, x, xOffset, incx);
+ }
+
+ @Override // dnrm2 -> Dnrm2.dnrm2; this overload passes xOffset = 0.
+ public double dnrm2(int n, double[] x, int incx) {
+ return Dnrm2.dnrm2(n, x, 0, incx);
+ }
+
+ @Override
+ public double dnrm2(int n, double[] x, int xOffset, int incx) {
+ return Dnrm2.dnrm2(n, x, xOffset, incx);
+ }
+
+ @Override // srot -> Srot.srot; this overload passes xOffset = yOffset = 0.
+ public void srot(int n, float[] x, int incx, float[] y, int incy, float c, float s) {
+ Srot.srot(n, x, 0, incx, y, 0, incy, c, s);
+ }
+
+ @Override
+ public void srot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, float c, float s) {
+ Srot.srot(n, x, xOffset, incx, y, yOffset, incy, c, s);
+ }
+
+ @Override // drot -> Drot.drot; this overload passes xOffset = yOffset = 0.
+ public void drot(int n, double[] x, int incx, double[] y, int incy, double c, double s) {
+ Drot.drot(n, x, 0, incx, y, 0, incy, c, s);
+ }
+
+ @Override
+ public void drot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy, double c, double s) {
+ Drot.drot(n, x, xOffset, incx, y, yOffset, incy, c, s);
+ }
+
+ @Override // srotm -> Srotm.srotm; this overload passes xOffset = yOffset = paramOffset = 0.
+ public void srotm(int n, float[] x, int incx, float[] y, int incy, float[] param) {
+ Srotm.srotm(n, x, 0, incx, y, 0, incy, param, 0);
+ }
+
+ @Override
+ public void srotm(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, float[] param,
+ int paramOffset) {
+ Srotm.srotm(n, x, xOffset, incx, y, yOffset, incy, param, paramOffset);
+ }
+
+ @Override // drotm -> Drotm.drotm; this overload passes xOffset = yOffset = paramOffset = 0.
+ public void drotm(int n, double[] x, int incx, double[] y, int incy, double[] param) {
+ Drotm.drotm(n, x, 0, incx, y, 0, incy, param, 0);
+ }
+
+ @Override
+ public void drotm(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy, double[] param,
+ int paramOffset) {
+ Drotm.drotm(n, x, xOffset, incx, y, yOffset, incy, param, paramOffset);
+ }
+
+ @Override // sscal -> Sscal.sscal; this overload passes xOffset = 0.
+ public void sscal(int n, float alp, float[] x, int incx) {
+ Sscal.sscal(n, alp, x, 0, incx);
+ }
+
+ @Override
+ public void sscal(int n, float alp, float[] x, int xOffset, int incx) {
+ Sscal.sscal(n, alp, x, xOffset, incx);
+ }
+
+ @Override // dscal -> Dscal.dscal; this overload passes xOffset = 0.
+ public void dscal(int n, double alp, double[] x, int incx) {
+ Dscal.dscal(n, alp, x, 0, incx);
+ }
+
+ @Override
+ public void dscal(int n, double alp, double[] x, int xOffset, int incx) {
+ Dscal.dscal(n, alp, x, xOffset, incx);
+ }
+
+ @Override // sswap -> Sswap.sswap; this overload passes xOffset = yOffset = 0.
+ public void sswap(int n, float[] x, int incx, float[] y, int incy) {
+ Sswap.sswap(n, x, 0, incx, y, 0, incy);
+ }
+
+ @Override
+ public void sswap(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
+ Sswap.sswap(n, x, xOffset, incx, y, yOffset, incy);
+ }
+
+ @Override // dswap -> Dswap.dswap; this overload passes xOffset = yOffset = 0.
+ public void dswap(int n, double[] x, int incx, double[] y, int incy) {
+ Dswap.dswap(n, x, 0, incx, y, 0, incy);
+ }
+
+ @Override
+ public void dswap(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) {
+ Dswap.dswap(n, x, xOffset, incx, y, yOffset, incy);
+ }
+
+ @Override // isamax -> Isamax.isamax; this overload passes xOffset = 0.
+ public int isamax(int n, float[] x, int incx) {
+ return Isamax.isamax(n, x, 0, incx);
+ }
+
+ @Override
+ public int isamax(int n, float[] x, int xOffset, int incx) {
+ return Isamax.isamax(n, x, xOffset, incx);
+ }
+
+ @Override // idamax -> Idamax.idamax; this overload passes xOffset = 0.
+ public int idamax(int n, double[] x, int incx) {
+ return Idamax.idamax(n, x, 0, incx);
+ }
+
+ @Override
+ public int idamax(int n, double[] x, int xOffset, int incx) {
+ return Idamax.idamax(n, x, xOffset, incx);
+ }
+
+ @Override // dgemv -> Dgemv.dgemv; this overload passes aOffset = xOffset = yOffset = 0.
+ public void dgemv(String trans, int m, int n, double alpha, double[] a, int lda, double[] x, int incx, double beta,
+ double[] y, int incy) {
+ Dgemv.dgemv(trans, m, n, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy);
+ }
+
+ @Override
+ public void dgemv(String trans, int m, int n, double alpha, double[] a, int aOffset, int lda, double[] x,
+ int xOffset, int incx, double beta, double[] y, int yOffset, int incy) {
+ Dgemv.dgemv(trans, m, n, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy);
+ }
+
+ @Override // sgemv -> Sgemv.sgemv; this overload passes aOffset = xOffset = yOffset = 0.
+ public void sgemv(String trans, int m, int n, float alpha, float[] a, int lda, float[] x, int incx, float beta,
+ float[] y, int incy) {
+ Sgemv.sgemv(trans, m, n, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy);
+ }
+
+ @Override
+ public void sgemv(String trans, int m, int n, float alpha, float[] a, int aOffset, int lda, float[] x,
+ int xOffset, int incx, float beta, float[] y, int yOffset, int incy) {
+ Sgemv.sgemv(trans, m, n, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy);
+ }
+
+ @Override // dger -> Dger.dger; this overload passes xOffset = yOffset = aOffset = 0.
+ public void dger(int m, int n, double alpha, double[] x, int incx, double[] y, int incy, double[] a, int lda) {
+ Dger.dger(m, n, alpha, x, 0, incx, y, 0, incy, a, 0, lda);
+ }
+
+ @Override
+ public void dger(int m, int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy,
+ double[] a, int aOffset, int lda) {
+ Dger.dger(m, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset, lda);
+ }
+
+ @Override // sger -> Sger.sger; this overload passes xOffset = yOffset = aOffset = 0.
+ public void sger(int m, int n, float alpha, float[] x, int incx, float[] y, int incy, float[] a, int lda) {
+ Sger.sger(m, n, alpha, x, 0, incx, y, 0, incy, a, 0, lda);
+ }
+
+ @Override
+ public void sger(int m, int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy,
+ float[] a, int aOffset, int lda) {
+ Sger.sger(m, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset, lda);
+ }
+
+ @Override // dspmv -> Dspmv.dspmv; this overload passes aOffset = xOffset = yOffset = 0.
+ public void dspmv(String uplo, int n, double alpha, double[] a, double[] x, int incx, double beta,
+ double[] y, int incy) {
+ Dspmv.dspmv(uplo, n, alpha, a, 0, x, 0, incx, beta, y, 0, incy);
+ }
+
+ @Override
+ public void dspmv(String uplo, int n, double alpha, double[] a, int aOffset, double[] x, int xOffset, int incx,
+ double beta, double[] y, int yOffset, int incy) {
+ Dspmv.dspmv(uplo, n, alpha, a, aOffset, x, xOffset, incx, beta, y, yOffset, incy);
+ }
+
+ @Override // sspmv -> Sspmv.sspmv; this overload passes aOffset = xOffset = yOffset = 0.
+ public void sspmv(String uplo, int n, float alpha, float[] a, float[] x, int incx, float beta,
+ float[] y, int incy) {
+ Sspmv.sspmv(uplo, n, alpha, a, 0, x, 0, incx, beta, y, 0, incy);
+ }
+
+ @Override
+ public void sspmv(String uplo, int n, float alpha, float[] a, int aOffset, float[] x, int xOffset, int incx,
+ float beta, float[] y, int yOffset, int incy) {
+ Sspmv.sspmv(uplo, n, alpha, a, aOffset, x, xOffset, incx, beta, y, yOffset, incy);
+ }
+
+ @Override // dspr -> Dspr.dspr; this overload passes 0 as the offset of both x and ap.
+ public void dspr(String uplo, int n, double alpha, double[] x, int incx, double[] ap) {
+ Dspr.dspr(uplo, n, alpha, x, 0, incx, ap, 0);
+ }
+
+ @Override
+ public void dspr(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] ap, int aOffset) {
+ Dspr.dspr(uplo, n, alpha, x, xOffset, incx, ap, aOffset);
+ }
+
+ @Override // sspr -> Sspr.sspr; this overload passes 0 as the offset of both x and ap.
+ public void sspr(String uplo, int n, float alpha, float[] x, int incx, float[] ap) {
+ Sspr.sspr(uplo, n, alpha, x, 0, incx, ap, 0);
+ }
+
+ @Override
+ public void sspr(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] ap, int aOffset) {
+ Sspr.sspr(uplo, n, alpha, x, xOffset, incx, ap, aOffset);
+ }
+
+ @Override // dsymv -> Dsymv.dsymv; this overload passes aOffset = xOffset = yOffset = 0.
+ public void dsymv(String uplo, int n, double alpha, double[] a, int lda, double[] x, int incx, double beta,
+ double[] y, int incy) {
+ Dsymv.dsymv(uplo, n, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy);
+ }
+
+ @Override
+ public void dsymv(String uplo, int n, double alpha, double[] a, int aOffset, int lda, double[] x, int xOffset,
+ int incx, double beta, double[] y, int yOffset, int incy) {
+ Dsymv.dsymv(uplo, n, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy);
+ }
+
+ @Override // ssymv -> Ssymv.ssymv; this overload passes aOffset = xOffset = yOffset = 0.
+ public void ssymv(String uplo, int n, float alpha, float[] a, int lda, float[] x, int incx, float beta,
+ float[] y, int incy) {
+ Ssymv.ssymv(uplo, n, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy);
+ }
+
+ @Override
+ public void ssymv(String uplo, int n, float alpha, float[] a, int aOffset, int lda, float[] x, int xOffset,
+ int incx, float beta, float[] y, int yOffset, int incy) {
+ Ssymv.ssymv(uplo, n, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy);
+ }
+
+ @Override // dgemm -> Dgemm.dgemm; this overload passes aOffset = bOffset = cOffset = 0.
+ public void dgemm(String transa, String transb, int m, int n, int k, double alpha, double[] a, int lda,
+ double[] b, int ldb, double beta, double[] c, int ldc) {
+ Dgemm.dgemm(transa, transb, m, n, k, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc);
+ }
+
+ @Override
+ public void dgemm(String transa, String transb, int m, int n, int k, double alpha, double[] a, int aOffset,
+ int lda, double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc) {
+ Dgemm.dgemm(transa, transb, m, n, k, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc);
+ }
+
+ @Override // sgemm -> Sgemm.sgemm; this overload passes aOffset = bOffset = cOffset = 0.
+ public void sgemm(String transa, String transb, int m, int n, int k, float alpha, float[] a, int lda,
+ float[] b, int ldb, float beta, float[] c, int ldc) {
+ Sgemm.sgemm(transa, transb, m, n, k, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc);
+ }
+
+ @Override
+ public void sgemm(String transa, String transb, int m, int n, int k, float alpha, float[] a, int aOffset,
+ int lda, float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc) {
+ Sgemm.sgemm(transa, transb, m, n, k, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc);
+ }
+
+ @Override // dsymm -> Dsymm.dsymm; this overload passes aOffset = bOffset = cOffset = 0.
+ public void dsymm(String side, String uplo, int m, int n, double alpha, double[] a, int lda,
+ double[] b, int ldb, double beta, double[] c, int ldc) {
+ Dsymm.dsymm(side, uplo, m, n, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc);
+ }
+
+ @Override
+ public void dsymm(String side, String uplo, int m, int n, double alpha, double[] a, int aOffset, int lda,
+ double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc) {
+ Dsymm.dsymm(side, uplo, m, n, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc);
+ }
+
+ @Override // ssymm -> Ssymm.ssymm; this overload passes aOffset = bOffset = cOffset = 0.
+ public void ssymm(String side, String uplo, int m, int n, float alpha, float[] a, int lda,
+ float[] b, int ldb, float beta, float[] c, int ldc) {
+ Ssymm.ssymm(side, uplo, m, n, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc);
+ }
+
+ @Override
+ public void ssymm(String side, String uplo, int m, int n, float alpha, float[] a, int aOffset, int lda,
+ float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc) {
+ Ssymm.ssymm(side, uplo, m, n, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc);
+ }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dasum.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dasum.java
new file mode 100644
index 0000000000000000000000000000000000000000..2d9f6789f11fc7f2d1f3a798d7b73224da5080af
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dasum.java
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.doubleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
+public class Dasum {
+    private static final VectorSpecies<Double> DSPECIES = DoubleVector.SPECIES_MAX;
+
+    /** Sums the absolute values of n elements of x (stride incx); 0.0 when {@code n < 1} or {@code incx < 1}. */
+    public static double dasum(int n, double[] x, int xOffset, int incx) {
+        if (n < 1 || incx < 1) {
+            return 0.0;
+        }
+        // incx is at least 1 past the guard above, so Math.abs(incx) would simply equal incx.
+        BlasUtils.checkBlasArray("x", xOffset, incx * (n - 1), x.length);
+        return incx == 1 ? vecDasum(n, x, xOffset) : norDasum(n, x, xOffset, incx);
+    }
+
+    /** Unit-stride path: accumulates whole vectors, then finishes the remainder with scalar code. */
+    private static double vecDasum(int n, double[] x, int xOffset) {
+        DoubleVector acc = DoubleVector.zero(DSPECIES);
+        int bound = DSPECIES.loopBound(n);
+        int i = 0;
+        for (; i < bound; i += DSPECIES.length()) {
+            acc = acc.add(DoubleVector.fromArray(DSPECIES, x, i + xOffset).abs());
+        }
+        double result = acc.reduceLanes(VectorOperators.ADD);
+        for (; i < n; i++) {
+            result += Math.abs(x[i + xOffset]);
+        }
+        return result;
+    }
+
+    /** Strided path: scalar accumulation over every incx-th element starting at xOffset. */
+    private static double norDasum(int n, double[] x, int xOffset, int incx) {
+        double result = 0.0d;
+        int xIndex = xOffset;
+        for (int count = 0; count < n; count++, xIndex += incx) {
+            result += Math.abs(x[xIndex]);
+        }
+        return result;
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Daxpy.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Daxpy.java
new file mode 100644
index 0000000000000000000000000000000000000000..fea183b58f45250efcaa51b00c64836b1db0c0b1
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Daxpy.java
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.doubleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorSpecies;
+
+public class Daxpy {
+    private static final VectorSpecies<Double> DSPECIES = DoubleVector.SPECIES_MAX;
+
+    /** Updates y in place with y += alpha * x over n strided elements; no-op if n < 1 or BlasUtils.isZero(alpha). */
+    public static void daxpy(int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset,
+        int incy) {
+        if (n < 1 || BlasUtils.isZero(alpha)) {
+            return;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+        BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length);
+        if (incx != 1 || incy != 1) {
+            norDaxpy(n, alpha, x, xOffset, incx, y, yOffset, incy);
+            return;
+        }
+        vecDaxpy(n, alpha, x, xOffset, y, yOffset);
+    }
+
+    /** Unit-stride path. The SIMD loop uses a fused multiply-add; the scalar tail multiplies, then adds. */
+    private static void vecDaxpy(int n, double alpha, double[] x, int xOffset, double[] y, int yOffset) {
+        DoubleVector alphaVec = DoubleVector.broadcast(DSPECIES, alpha);
+        int bound = DSPECIES.loopBound(n);
+        int i = 0;
+        for (; i < bound; i += DSPECIES.length()) {
+            DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, i + xOffset);
+            alphaVec.fma(xv, DoubleVector.fromArray(DSPECIES, y, i + yOffset)).intoArray(y, i + yOffset);
+        }
+        for (; i < n; i++) {
+            y[i + yOffset] += alpha * x[i + xOffset];
+        }
+    }
+
+    /** General-stride path; a negative increment starts (n - 1) * |inc| elements past the offset and walks back. */
+    private static void norDaxpy(int n, double alpha, double[] x, int xOffset, int incx,
+        double[] y, int yOffset, int incy) {
+        int xIndex = xOffset + (incx >= 0 ? 0 : (n - 1) * -incx);
+        int yIndex = yOffset + (incy >= 0 ? 0 : (n - 1) * -incy);
+        for (int count = 0; count < n; count++, xIndex += incx, yIndex += incy) {
+            y[yIndex] += alpha * x[xIndex];
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dcopy.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dcopy.java
new file mode 100644
index 0000000000000000000000000000000000000000..3ffc363604dab93fff480a5176f07a18b8f128cb
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dcopy.java
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.doubleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+public class Dcopy {
+    /** Copies n elements of x (stride incx) into y (stride incy); does nothing when {@code n <= 0}. */
+    public static void dcopy(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) {
+        if (n <= 0) {
+            return;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+        BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length);
+        if (incx == incy && (incx == 1 || incx == -1)) { // equal unit strides pair up the same elements
+            System.arraycopy(x, xOffset, y, yOffset, n);
+            return;
+        }
+        norDcopy(n, x, xOffset, incx, y, yOffset, incy);
+    }
+
+    /** Element-by-element strided copy; a negative stride starts (n - 1) * |inc| elements past its offset. */
+    private static void norDcopy(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) {
+        int src = xOffset + (incx < 0 ? (1 - n) * incx : 0);
+        int dst = yOffset + (incy < 0 ? (1 - n) * incy : 0);
+        for (int left = n; left > 0; left--, src += incx, dst += incy) {
+            y[dst] = x[src];
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Ddot.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Ddot.java
new file mode 100644
index 0000000000000000000000000000000000000000..b9742dfdf7c15851b95b713cc90e26d3384f1288
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Ddot.java
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.doubleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
+public class Ddot {
+    private static final VectorSpecies<Double> DSPECIES = DoubleVector.SPECIES_MAX;
+
+    /** Returns the sum of x[i] * y[i] over n strided element pairs, or 0.0 when {@code n < 1}. */
+    public static double ddot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) {
+        if (n < 1) {
+            return 0.0d;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+        BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length);
+        if (incx != 1 || incy != 1) {
+            return norDdot(n, x, xOffset, incx, y, yOffset, incy);
+        }
+        // Zero offsets use the kernel that indexes both arrays without adding an offset.
+        return (xOffset == 0 && yOffset == 0) ? vecDdot(n, x, y) : vecDdot(n, x, xOffset, y, yOffset);
+    }
+
+    /** Unit-stride kernel for arrays that are both read from index 0. */
+    private static double vecDdot(int n, double[] x, double[] y) {
+        DoubleVector sumVec = DoubleVector.zero(DSPECIES);
+        int bound = DSPECIES.loopBound(n);
+        int i = 0;
+        for (; i < bound; i += DSPECIES.length()) {
+            DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, i);
+            sumVec = xv.fma(DoubleVector.fromArray(DSPECIES, y, i), sumVec);
+        }
+        double sum = sumVec.reduceLanes(VectorOperators.ADD);
+        for (; i < n; i++) {
+            sum += x[i] * y[i];
+        }
+        return sum;
+    }
+
+    /** Unit-stride kernel for arrays that are read from the given offsets. */
+    private static double vecDdot(int n, double[] x, int xOffset, double[] y, int yOffset) {
+        DoubleVector sumVec = DoubleVector.zero(DSPECIES);
+        int bound = DSPECIES.loopBound(n);
+        int i = 0;
+        for (; i < bound; i += DSPECIES.length()) {
+            DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, i + xOffset);
+            sumVec = xv.fma(DoubleVector.fromArray(DSPECIES, y, i + yOffset), sumVec);
+        }
+        double sum = sumVec.reduceLanes(VectorOperators.ADD);
+        for (; i < n; i++) {
+            sum += x[i + xOffset] * y[i + yOffset];
+        }
+        return sum;
+    }
+
+    /** General-stride scalar kernel; a negative increment starts (n - 1) * |inc| elements past its offset. */
+    private static double norDdot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) {
+        int xIndex = xOffset + (incx >= 0 ? 0 : (n - 1) * -incx);
+        int yIndex = yOffset + (incy >= 0 ? 0 : (n - 1) * -incy);
+        double sum = 0.0d;
+        for (int count = 0; count < n; count++) {
+            sum += y[yIndex] * x[xIndex];
+            xIndex += incx;
+            yIndex += incy;
+        }
+        return sum;
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dnrm2.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dnrm2.java
new file mode 100644
index 0000000000000000000000000000000000000000..b6dd883161376cd63b1e3556f83fd9750122a223
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dnrm2.java
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.doubleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+public class Dnrm2 {
+    private static final int MINEXPONENT = -1021; // Minimum exponent in the floating-point model of double.
+    private static final int MAXEXPONENT = 1024; // Maximum exponent in the floating-point model of double.
+    private static final int DIGITS = 53; // Number of significant binary digits of double.
+
+    // Blue's scaling constants: T_* are the magnitude thresholds, S_* the scale factors used beyond them.
+    private static final double T_SML = Math.pow(2, Math.ceil((MINEXPONENT - 1) * 0.5d));
+    private static final double T_BIG = Math.pow(2, Math.floor((MAXEXPONENT - DIGITS + 1) * 0.5d));
+    private static final double S_SML = Math.pow(2, -1 * Math.floor((MINEXPONENT - DIGITS) * 0.5d));
+    private static final double S_BIG = Math.pow(2, -1 * Math.ceil((MAXEXPONENT + DIGITS - 1) * 0.5d));
+
+    /** Euclidean norm of n strided elements of x; 0.0 when {@code n < 1 || incx < 1}; NaN if an element is NaN. */
+    public static double dnrm2(int n, double[] x, int xOffset, int incx) {
+        if (n < 1 || incx < 1) {
+            return 0.0;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+        return norDnrm2(n, x, xOffset, incx);
+    }
+
+    /** Accumulates sums of squares in three magnitude bins (two of them scaled), then combines the bins. */
+    private static double norDnrm2(int n, double[] x, int xOffset, int incx) {
+        boolean notBig = true;
+        double aSml = 0.0d;
+        double aMed = 0.0d;
+        double aBig = 0.0d;
+        int xIndex = xOffset;
+        for (int count = 0; count < n; count++, xIndex += incx) {
+            double ax = Math.abs(x[xIndex]);
+            if (ax > T_BIG) {
+                aBig += (ax * S_BIG) * (ax * S_BIG);
+                notBig = false;
+            } else if (ax < T_SML) {
+                if (notBig) { // aSml is never read once aBig is positive, so stop accumulating it
+                    aSml += (ax * S_SML) * (ax * S_SML);
+                }
+            } else {
+                aMed += ax * ax; // NaN fails both comparisons above and therefore lands here
+            }
+        }
+        // Use Double.isNaN: Double.compare(v, v) is 0 even for NaN, so it cannot serve as a self-inequality test.
+        boolean useMed = aMed > 0.0 || aMed > Double.MAX_VALUE || Double.isNaN(aMed);
+        double scaleVal;
+        double sumSq;
+        if (aBig > 0.0) {
+            if (useMed) {
+                aBig += (aMed * S_BIG) * S_BIG;
+            }
+            scaleVal = 1.0d / S_BIG;
+            sumSq = aBig;
+        } else if (aSml > 0.0) {
+            if (useMed) {
+                aMed = Math.sqrt(aMed);
+                aSml = Math.sqrt(aSml) / S_SML;
+                double yMin = aSml > aMed ? aMed : aSml;
+                double yMax = aSml > aMed ? aSml : aMed;
+                scaleVal = 1.0d;
+                double ratio = yMin / yMax;
+                sumSq = yMax * yMax * (1.0d + ratio * ratio);
+            } else {
+                scaleVal = 1.0d / S_SML;
+                sumSq = aSml;
+            }
+        } else {
+            scaleVal = 1.0d;
+            sumSq = aMed;
+        }
+        return scaleVal * Math.sqrt(sumSq);
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Drot.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Drot.java
new file mode 100644
index 0000000000000000000000000000000000000000..bd6ba3a34ead3d60a02593c52d4b95c7b42579e3
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Drot.java
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.doubleprecision;
+
+import static com.huawei.vectorblas.utils.ArrayUtil.loopBound;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorSpecies;
+
+/** BLAS level-1 drot: applies a plane rotation with coefficients c and s to the vector pair (x, y). */
+public class Drot {
+    private static final VectorSpecies<Double> DSPECIES = DoubleVector.SPECIES_MAX;
+
+    /** Sets x to c*x + s*y and y to c*y - s*x (old x) for n strided pairs; no-op when {@code n < 1}. */
+    public static void drot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy, double c,
+            double s) {
+        if (n < 1) {
+            return;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+        BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length);
+        if (incx == 1 && incy == 1) {
+            vecDrot(n, x, xOffset, y, yOffset, c, s);
+        } else {
+            norDrot(n, x, xOffset, incx, y, yOffset, incy, c, s);
+        }
+    }
+
+    /** Unit-stride path: a loop unrolled over four vectors, then single vectors, then a scalar tail. */
+    private static void vecDrot(int n, double[] x, int xOffset, double[] y, int yOffset, double c, double s) {
+        DoubleVector cv = DoubleVector.broadcast(DSPECIES, c);
+        DoubleVector sv = DoubleVector.broadcast(DSPECIES, s);
+        DoubleVector nsv = DoubleVector.broadcast(DSPECIES, -s);
+        int index = 0;
+        int idxLoopBound = loopBound(n, DSPECIES.length() * 4);
+        for (; index < idxLoopBound; index += DSPECIES.length() * 4) {
+            DoubleVector xv0 = DoubleVector.fromArray(DSPECIES, x, index + xOffset);
+            DoubleVector xv1 = DoubleVector.fromArray(DSPECIES, x, index + DSPECIES.length() + xOffset);
+            DoubleVector xv2 = DoubleVector.fromArray(DSPECIES, x, index + DSPECIES.length() * 2 + xOffset);
+            DoubleVector xv3 = DoubleVector.fromArray(DSPECIES, x, index + DSPECIES.length() * 3 + xOffset);
+            DoubleVector yv0 = DoubleVector.fromArray(DSPECIES, y, index + yOffset);
+            DoubleVector yv1 = DoubleVector.fromArray(DSPECIES, y, index + DSPECIES.length() + yOffset);
+            DoubleVector yv2 = DoubleVector.fromArray(DSPECIES, y, index + DSPECIES.length() * 2 + yOffset);
+            DoubleVector yv3 = DoubleVector.fromArray(DSPECIES, y, index + DSPECIES.length() * 3 + yOffset);
+            xv0.fma(cv, yv0.mul(sv)).intoArray(x, index + xOffset); // new x = x*c + y*s
+            xv1.fma(cv, yv1.mul(sv)).intoArray(x, index + DSPECIES.length() + xOffset);
+            xv2.fma(cv, yv2.mul(sv)).intoArray(x, index + DSPECIES.length() * 2 + xOffset);
+            xv3.fma(cv, yv3.mul(sv)).intoArray(x, index + DSPECIES.length() * 3 + xOffset);
+            xv0.fma(nsv, yv0.mul(cv)).intoArray(y, index + yOffset); // new y = x*(-s) + y*c, from the loaded x
+            xv1.fma(nsv, yv1.mul(cv)).intoArray(y, index + DSPECIES.length() + yOffset);
+            xv2.fma(nsv, yv2.mul(cv)).intoArray(y, index + DSPECIES.length() * 2 + yOffset);
+            xv3.fma(nsv, yv3.mul(cv)).intoArray(y, index + DSPECIES.length() * 3 + yOffset);
+        }
+        for (; index < DSPECIES.loopBound(n); index += DSPECIES.length()) {
+            DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, index + xOffset);
+            DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, index + yOffset);
+            xv.fma(cv, yv.mul(sv)).intoArray(x, index + xOffset);
+            xv.fma(nsv, yv.mul(cv)).intoArray(y, index + yOffset);
+        }
+        for (; index < n; index++) {
+            double tmp = x[index + xOffset]; // keep the old x: it is needed for the y update
+            x[index + xOffset] = c * x[index + xOffset] + s * y[index + yOffset];
+            y[index + yOffset] = c * y[index + yOffset] - s * tmp;
+        }
+    }
+
+    /** General-stride scalar path; a negative stride starts from the far end of that vector. */
+    private static void norDrot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy,
+            double c, double s) {
+        int xInitIndex = incx < 0 ? (-n + 1) * incx : 0;
+        int yInitIndex = incy < 0 ? (-n + 1) * incy : 0;
+        for (int num = n; num > 0; --num) {
+            double tmp = x[xInitIndex + xOffset];
+            x[xInitIndex + xOffset] = c * tmp + s * y[yInitIndex + yOffset];
+            y[yInitIndex + yOffset] = -s * tmp + c * y[yInitIndex + yOffset];
+            xInitIndex += incx;
+            yInitIndex += incy;
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Drotm.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Drotm.java
new file mode 100644
index 0000000000000000000000000000000000000000..592e76e765e4911a6c8b131678142fe6772cfbe2
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Drotm.java
@@ -0,0 +1,115 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.doubleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorSpecies;
+
+/**
+ * BLAS level-1 drotm: applies the 2x2 transformation H encoded in param to the vector pair (x, y).
+ */
+public class Drotm {
+    private static final VectorSpecies<Double> DSPECIES = DoubleVector.SPECIES_MAX;
+
+    /**
+     * Replaces each pair (x, y) by (h11*x + h12*y, h21*x + h22*y) for n strided pairs.
+     *
+     * <p>Five values are read from param starting at paramOffset: flag, h11, h21, h12, h22. A flag of -2
+     * returns without touching x or y, -1 uses all four stored entries, a flag accepted by
+     * BlasUtils.isZero forces h11 = h22 = 1, and any other flag forces h12 = 1 and h21 = -1.
+     *
+     * @param n number of pairs to transform; nothing happens when {@code n < 1}
+     * @param incx stride through x; a negative stride starts from the far end of x
+     * @param incy stride through y; a negative stride starts from the far end of y
+     */
+    public static void drotm(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy,
+            double[] param, int paramOffset) {
+        if (n < 1) {
+            return;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+        BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length);
+        BlasUtils.checkBlasArray("param", paramOffset, 4, param.length);
+        // Decode H once so the vector and scalar kernels cannot drift apart in how they read param.
+        double flag = param[paramOffset];
+        if (Double.compare(flag, -2.0d) == 0) { // flag -2.0: leave x and y untouched
+            return;
+        }
+        double h11 = param[paramOffset + 1];
+        double h12 = 1.0d;
+        double h21 = -1.0d;
+        double h22 = param[paramOffset + 4];
+        if (Double.compare(flag, -1.0d) == 0) {
+            h12 = param[paramOffset + 3];
+            h21 = param[paramOffset + 2];
+        } else if (BlasUtils.isZero(flag)) {
+            h11 = 1.0d;
+            h12 = param[paramOffset + 3];
+            h21 = param[paramOffset + 2];
+            h22 = 1.0d;
+        }
+        if (incx == 1 && incy == 1) {
+            vecDrotm(n, x, xOffset, y, yOffset, h11, h12, h21, h22);
+        } else {
+            norDrotm(n, x, xOffset, incx, y, yOffset, incy, h11, h12, h21, h22);
+        }
+    }
+
+    /** Unit-stride path: whole vectors first, then a scalar tail. */
+    private static void vecDrotm(int n, double[] x, int xOffset, double[] y, int yOffset, double h11, double h12,
+            double h21, double h22) {
+        DoubleVector h11v = DoubleVector.broadcast(DSPECIES, h11);
+        DoubleVector h12v = DoubleVector.broadcast(DSPECIES, h12);
+        DoubleVector h21v = DoubleVector.broadcast(DSPECIES, h21);
+        DoubleVector h22v = DoubleVector.broadcast(DSPECIES, h22);
+        int index = 0;
+        int idxLoopBound = DSPECIES.loopBound(n);
+        for (; index < idxLoopBound; index += DSPECIES.length()) {
+            DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, index + xOffset);
+            DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, index + yOffset);
+            (xv.mul(h11v)).add(yv.mul(h12v)).intoArray(x, index + xOffset);
+            (xv.mul(h21v)).add(yv.mul(h22v)).intoArray(y, index + yOffset);
+        }
+        for (; index < n; index++) {
+            double xTmp = x[index + xOffset]; // old x is still needed for the y update
+            x[index + xOffset] = h11 * xTmp + h12 * y[index + yOffset];
+            y[index + yOffset] = h21 * xTmp + h22 * y[index + yOffset];
+        }
+    }
+
+    /** General-stride scalar path. */
+    private static void norDrotm(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy,
+            double h11, double h12, double h21, double h22) {
+        int xInitIndex = incx < 0 ? (-n + 1) * incx : 0;
+        int yInitIndex = incy < 0 ? (-n + 1) * incy : 0;
+        for (int num = n; num > 0; --num) {
+            double xTmp = x[xInitIndex + xOffset];
+            x[xInitIndex + xOffset] = h11 * xTmp + h12 * y[yInitIndex + yOffset];
+            y[yInitIndex + yOffset] = h21 * xTmp + h22 * y[yInitIndex + yOffset];
+            xInitIndex += incx;
+            yInitIndex += incy;
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dscal.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dscal.java
new file mode 100644
index 0000000000000000000000000000000000000000..a6c78751990e9afb2fb741f8602cf3e1d9b1ffcd
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dscal.java
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.doubleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorSpecies;
+
+/** BLAS level-1 dscal: in-place scaling of a strided vector by a scalar. */
+public class Dscal {
+    private static final VectorSpecies<Double> DSPECIES = DoubleVector.SPECIES_MAX;
+
+    /** Multiplies n elements of x (stride incx) by alpha; no-op when n or incx is below 1 or alpha is 1.0. */
+    public static void dscal(int n, double alpha, double[] x, int xOffset, int incx) {
+        if (n < 1 || incx < 1 || Double.compare(alpha, 1.0) == 0) {
+            return;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+        if (incx == 1) {
+            vecDscal(n, alpha, x, xOffset);
+        } else {
+            norDscal(n, alpha, x, xOffset, incx);
+        }
+    }
+
+    private static void vecDscal(int n, double alpha, double[] x, int xOffset) { // unit-stride path
+        DoubleVector alpv = DoubleVector.broadcast(DSPECIES, alpha);
+        int index = 0;
+        int idxLoopBound = DSPECIES.loopBound(n);
+        for (; index < idxLoopBound; index += DSPECIES.length()) {
+            DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, index + xOffset);
+            xv.mul(alpv).intoArray(x, index + xOffset);
+        }
+        for (; index < n; index++) { // scalar tail for the elements that do not fill a vector
+            x[index + xOffset] *= alpha;
+        }
+    }
+
+    private static void norDscal(int n, double alpha, double[] x, int xOffset, int incx) { // strided path
+        for (int num = 0, xIndex = xOffset; num < n; num++, xIndex += incx) {
+            x[xIndex] *= alpha;
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dswap.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dswap.java
new file mode 100644
index 0000000000000000000000000000000000000000..f9bcd33108117a20c368e072504e09578f62a383
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dswap.java
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.doubleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorSpecies;
+
+/** BLAS level-1 dswap: exchanges the contents of two strided vectors. */
+public class Dswap {
+    private static final VectorSpecies<Double> DSPECIES = DoubleVector.SPECIES_MAX;
+
+    /** Swaps n elements of x (stride incx) with n elements of y (stride incy); no-op when {@code n < 1}. */
+    public static void dswap(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) {
+        if (n < 1) {
+            return;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+        BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length);
+        if (incx == 1 && incy == 1) {
+            vecDswap(n, x, xOffset, y, yOffset);
+        } else {
+            norDswap(n, x, xOffset, incx, y, yOffset, incy);
+        }
+    }
+
+    private static void vecDswap(int n, double[] x, int xOffset, double[] y, int yOffset) { // unit-stride path
+        int index = 0;
+        int idxLoopBound = DSPECIES.loopBound(n);
+        for (; index < idxLoopBound; index += DSPECIES.length()) {
+            DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, index + xOffset); // load both before storing
+            DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, index + yOffset);
+            xv.intoArray(y, index + yOffset);
+            yv.intoArray(x, index + xOffset);
+        }
+        for (; index < n; index++) { // scalar tail
+            double tmp = x[index + xOffset];
+            x[index + xOffset] = y[index + yOffset];
+            y[index + yOffset] = tmp;
+        }
+    }
+
+    private static void norDswap(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) {
+        int xIndex = incx < 0 ? (-n + 1) * incx : 0; // negative stride: start from the far end
+        int yIndex = incy < 0 ? (-n + 1) * incy : 0;
+        for (int num = n; num > 0; --num, xIndex += incx, yIndex += incy) {
+            double tmp = x[xIndex + xOffset];
+            x[xIndex + xOffset] = y[yIndex + yOffset];
+            y[yIndex + yOffset] = tmp;
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Idamax.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Idamax.java
new file mode 100644
index 0000000000000000000000000000000000000000..6da1ee69e8cbd390d2b7f2b603944c75b4d87a3f
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Idamax.java
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.doubleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
+/** BLAS level-1 idamax: position of the element with the largest absolute value. */
+public class Idamax {
+    private static final VectorSpecies<Double> DSPECIES = DoubleVector.SPECIES_MAX;
+
+    /** Returns the 1-based position of the first largest |x| among n elements; 0 if n or incx is not positive. */
+    public static int idamax(int n, double[] x, int xOffset, int incx) {
+        if (n <= 0 || incx <= 0) {
+            return 0;
+        }
+        if (n == 1) {
+            return 1;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+        return incx == 1 ? vecIdamax(n, x, xOffset) : norIdamax(n, x, xOffset, incx);
+    }
+
+    /** Unit-stride path: pick the vector holding the maximum, then the lane inside it, then check the tail. */
+    private static int vecIdamax(int n, double[] x, int xOffset) {
+        int indexOfMaxVec = 0;
+        double max = 0.0d;
+        int index = 0;
+        int idxLoopBound = DSPECIES.loopBound(n);
+        for (; index < idxLoopBound; index += DSPECIES.length()) {
+            DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, index + xOffset);
+            double maxOfLanes = xv.abs().reduceLanes(VectorOperators.MAX);
+            if (max < maxOfLanes) { // strict: the earliest vector wins a tie
+                max = maxOfLanes;
+                indexOfMaxVec = index;
+            }
+        }
+        // Start from 1 and stop at n: when NaN entries hide the match, the scan used to return 0 or run past n.
+        int indexOfMaxValue = 1;
+        for (int j = indexOfMaxVec, end = Math.min(indexOfMaxVec + DSPECIES.length(), n); j < end; j++) {
+            if (max <= Math.abs(x[j + xOffset])) {
+                indexOfMaxValue = j + 1;
+                break;
+            }
+        }
+        for (; index < n; index++) {
+            if (max < Math.abs(x[index + xOffset])) {
+                max = Math.abs(x[index + xOffset]);
+                indexOfMaxValue = index + 1;
+            }
+        }
+        return indexOfMaxValue;
+    }
+
+    /** Strided scalar path; only called with n >= 2 and incx >= 1. */
+    private static int norIdamax(int n, double[] x, int xOffset, int incx) {
+        int indexOfMaxValue = 1;
+        double max = Math.abs(x[xOffset]);
+        int xIndex = incx;
+        for (int j = 2; j <= n; ++j, xIndex += incx) {
+            double value = Math.abs(x[xIndex + xOffset]);
+            if (value > max) {
+                indexOfMaxValue = j;
+                max = value;
+            }
+        }
+        return indexOfMaxValue;
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Isamax.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Isamax.java
new file mode 100644
index 0000000000000000000000000000000000000000..88b7e735dae805fd89f9db1f9022908f09c8fca1
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Isamax.java
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.singleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
+/** BLAS level-1 isamax: position of the element with the largest absolute value (float). */
+public class Isamax {
+    private static final VectorSpecies<Float> SSPECIES = FloatVector.SPECIES_MAX;
+
+    /** Returns the 1-based position of the first largest |x| among n elements; 0 if n or incx is not positive. */
+    public static int isamax(int n, float[] x, int xOffset, int incx) {
+        if (n <= 0 || incx <= 0) {
+            return 0;
+        }
+        if (n == 1) {
+            return 1;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+        return incx == 1 ? vecIsamax(n, x, xOffset) : norIsamax(n, x, xOffset, incx);
+    }
+
+    /** Unit-stride path: pick the vector holding the maximum, then the lane inside it, then check the tail. */
+    private static int vecIsamax(int n, float[] x, int xOffset) {
+        float max = 0.0f;
+        int indexOfMaxVec = 0;
+        int index = 0;
+        int idxLoopBound = SSPECIES.loopBound(n);
+        for (; index < idxLoopBound; index += SSPECIES.length()) {
+            FloatVector xv = FloatVector.fromArray(SSPECIES, x, index + xOffset);
+            float maxOfLanes = xv.abs().reduceLanes(VectorOperators.MAX);
+            if (max < maxOfLanes) { // strict: the earliest vector wins a tie
+                max = maxOfLanes;
+                indexOfMaxVec = index;
+            }
+        }
+        // Start from 1 and stop at n: when NaN entries hide the match, the scan used to return 0 or run past n.
+        int indexOfMaxValue = 1;
+        for (int j = indexOfMaxVec, end = Math.min(indexOfMaxVec + SSPECIES.length(), n); j < end; j++) {
+            if (max <= Math.abs(x[j + xOffset])) {
+                indexOfMaxValue = j + 1;
+                break;
+            }
+        }
+        for (; index < n; index++) {
+            if (max < Math.abs(x[index + xOffset])) {
+                max = Math.abs(x[index + xOffset]);
+                indexOfMaxValue = index + 1;
+            }
+        }
+        return indexOfMaxValue;
+    }
+
+    /** Strided scalar path; only called with n >= 2 and incx >= 1. */
+    private static int norIsamax(int n, float[] x, int xOffset, int incx) {
+        int indexOfMaxValue = 1;
+        float max = Math.abs(x[xOffset]);
+        int xIndex = incx;
+        for (int j = 2; j <= n; j++, xIndex += incx) {
+            float val = Math.abs(x[xIndex + xOffset]);
+            if (val > max) {
+                indexOfMaxValue = j;
+                max = val;
+            }
+        }
+        return indexOfMaxValue;
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sasum.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sasum.java
new file mode 100644
index 0000000000000000000000000000000000000000..4574795989599d8379e9c6b0a774db0760c0e793
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sasum.java
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.singleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
+/** BLAS level-1 sasum: sum of the absolute values of a strided float vector. */
+public class Sasum {
+    private static final VectorSpecies<Float> SSPECIES = FloatVector.SPECIES_MAX;
+
+    /** Returns the sum of |x| over n elements taken with stride incx, or 0 when n or incx is below 1. */
+    public static float sasum(int n, float[] x, int xOffset, int incx) {
+        if (n < 1 || incx < 1) {
+            return 0.0f;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+        return incx == 1 ? vecSasum(n, x, xOffset) : norSasum(n, x, xOffset, incx);
+    }
+
+    /** Unit-stride path: per-lane partial sums over whole vectors, then a scalar tail. */
+    private static float vecSasum(int n, float[] x, int xOffset) {
+        int xIndex = 0;
+        FloatVector resVec = FloatVector.zero(SSPECIES);
+        int idxLoopBound = SSPECIES.loopBound(n);
+        for (; xIndex < idxLoopBound; xIndex += SSPECIES.length()) {
+            FloatVector xv = FloatVector.fromArray(SSPECIES, x, xIndex + xOffset);
+            resVec = resVec.add(xv.abs());
+        }
+        // The lane sums are folded here, so rounding may differ from a strictly left-to-right sum.
+        float result = resVec.reduceLanes(VectorOperators.ADD);
+        for (; xIndex < n; xIndex++) {
+            result += Math.abs(x[xIndex + xOffset]);
+        }
+        return result;
+    }
+
+    /** General-stride scalar path; only called with incx >= 1. */
+    private static float norSasum(int n, float[] x, int xOffset, int incx) {
+        float result = 0.0f;
+        for (int count = 0, xIndex = xOffset; count < n; count++, xIndex += incx) {
+            result += Math.abs(x[xIndex]);
+        }
+        return result;
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Saxpy.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Saxpy.java
new file mode 100644
index 0000000000000000000000000000000000000000..5dd3675a29f1dab6361c9f50be4289c65b99113a
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Saxpy.java
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.singleprecision;
+
+import static com.huawei.vectorblas.utils.ArrayUtil.loopBound;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.VectorSpecies;
+
+/** BLAS level-1 saxpy: adds a scaled strided float vector to another, y = alpha * x + y. */
+public class Saxpy {
+    private static final VectorSpecies<Float> SSPECIES = FloatVector.SPECIES_MAX;
+
+    /** Adds alpha times n elements of x (stride incx) to n elements of y (stride incy); no-op if n is below 1. */
+    public static void saxpy(int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
+        if (n < 1 || BlasUtils.isZero(alpha)) { // a zero alpha (per BlasUtils.isZero) leaves y unchanged
+            return;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+        BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length);
+        if (incx == 1 && incy == 1) {
+            vecSaxpy(n, alpha, x, xOffset, y, yOffset);
+        } else {
+            norSaxpy(n, alpha, x, xOffset, incx, y, yOffset, incy);
+        }
+    }
+
+    /** Unit-stride path: a loop unrolled over four vectors, then single vectors, then a scalar tail. */
+    private static void vecSaxpy(int n, float alpha, float[] x, int xOffset, float[] y, int yOffset) {
+        FloatVector alphaVec = FloatVector.broadcast(SSPECIES, alpha);
+        int index = 0;
+        int idxLoopBound = loopBound(n, (SSPECIES.length() * 4));
+        for (; index < idxLoopBound; index += SSPECIES.length() * 4) {
+            FloatVector xv0 = FloatVector.fromArray(SSPECIES, x, index + xOffset);
+            FloatVector xv1 = FloatVector.fromArray(SSPECIES, x, index + SSPECIES.length() + xOffset);
+            FloatVector xv2 = FloatVector.fromArray(SSPECIES, x, index + SSPECIES.length() * 2 + xOffset);
+            FloatVector xv3 = FloatVector.fromArray(SSPECIES, x, index + SSPECIES.length() * 3 + xOffset);
+            FloatVector yv0 = FloatVector.fromArray(SSPECIES, y, index + yOffset);
+            FloatVector yv1 = FloatVector.fromArray(SSPECIES, y, index + SSPECIES.length() + yOffset);
+            FloatVector yv2 = FloatVector.fromArray(SSPECIES, y, index + SSPECIES.length() * 2 + yOffset);
+            FloatVector yv3 = FloatVector.fromArray(SSPECIES, y, index + SSPECIES.length() * 3 + yOffset);
+            xv0.fma(alphaVec, yv0).intoArray(y, index + yOffset); // y = x * alpha + y
+            xv1.fma(alphaVec, yv1).intoArray(y, index + SSPECIES.length() + yOffset);
+            xv2.fma(alphaVec, yv2).intoArray(y, index + SSPECIES.length() * 2 + yOffset);
+            xv3.fma(alphaVec, yv3).intoArray(y, index + SSPECIES.length() * 3 + yOffset);
+        }
+        for (; index < SSPECIES.loopBound(n); index += SSPECIES.length()) {
+            FloatVector xv0 = FloatVector.fromArray(SSPECIES, x, index + xOffset);
+            FloatVector yv0 = FloatVector.fromArray(SSPECIES, y, index + yOffset);
+            xv0.fma(alphaVec, yv0).intoArray(y, index + yOffset);
+        }
+        for (; index < n; index++) {
+            y[index + yOffset] += alpha * x[index + xOffset];
+        }
+    }
+
+    /** General-stride scalar path; a negative stride starts from the far end of that vector. */
+    private static void norSaxpy(int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset,
+            int incy) {
+        int xIndex = incx >= 0 ? 0 : (n - 1) * -incx;
+        int yIndex = incy >= 0 ? 0 : (n - 1) * -incy;
+        for (int count = 0; count < n; count++, xIndex += incx, yIndex += incy) {
+            y[yIndex + yOffset] += alpha * x[xIndex + xOffset];
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Scopy.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Scopy.java
new file mode 100644
index 0000000000000000000000000000000000000000..7393bf3f7542237138e3ba7b4744d1c100df61e4
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Scopy.java
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.singleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+/** BLAS level-1 scopy: copies one strided float vector into another. */
+public class Scopy {
+    /** Copies n elements of x (stride incx) into y (stride incy); a non-positive n copies nothing. */
+    public static void scopy(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
+        if (n <= 0) {
+            return;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+        BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length);
+        if (incx != 1 || incy != 1) {
+            norScopy(n, x, xOffset, incx, y, yOffset, incy);
+            return;
+        }
+        System.arraycopy(x, xOffset, y, yOffset, n); // both vectors contiguous: bulk copy
+    }
+
+    private static void norScopy(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
+        int src = xOffset + (incx < 0 ? (1 - n) * incx : 0); // negative stride: begin at the far end
+        int dst = yOffset + (incy < 0 ? (1 - n) * incy : 0);
+        for (int remaining = n; remaining > 0; remaining--, src += incx, dst += incy) {
+            y[dst] = x[src];
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sdot.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sdot.java
new file mode 100644
index 0000000000000000000000000000000000000000..6a8f8343fd5aa659e8c5dc34b390483f15eaa6c4
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sdot.java
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.singleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
+/**
+ * Single-precision BLAS level-1 DOT: the inner product of two strided vectors. Unit-stride input is
+ * processed with the Vector API using fused multiply-add; every other stride goes through a scalar loop.
+ */
+public class Sdot {
+    private static final VectorSpecies<Float> SSPECIES = FloatVector.SPECIES_MAX;
+
+    /**
+     * Computes the sum of x[i] * y[i] over n elements.
+     *
+     * @param n number of elements; values below 1 yield 0
+     * @param x first vector
+     * @param xOffset offset added to every index into x
+     * @param incx stride between consecutive elements of x; may be negative
+     * @param y second vector
+     * @param yOffset offset added to every index into y
+     * @param incy stride between consecutive elements of y; may be negative
+     * @return the dot product
+     */
+    public static float sdot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
+        if (n < 1) {
+            return 0.0f;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+        BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length);
+        if (incx == 1 && incy == 1) {
+            // One kernel serves every offset: a zero offset simply adds 0 to each index.
+            return vecSdot(n, x, xOffset, y, yOffset);
+        }
+        return norSdot(n, x, xOffset, incx, y, yOffset, incy);
+    }
+
+    /** Unit-stride kernel: FMA-accumulates whole vectors, reduces the lanes, then adds the scalar tail. */
+    private static float vecSdot(int n, float[] x, int xOffset, float[] y, int yOffset) {
+        FloatVector sumVec = FloatVector.zero(SSPECIES);
+        int index = 0;
+        int idxLoopBound = SSPECIES.loopBound(n);
+        for (; index < idxLoopBound; index += SSPECIES.length()) {
+            FloatVector av = FloatVector.fromArray(SSPECIES, x, index + xOffset);
+            FloatVector bv = FloatVector.fromArray(SSPECIES, y, index + yOffset);
+            sumVec = av.fma(bv, sumVec);
+        }
+        float sum = sumVec.reduceLanes(VectorOperators.ADD);
+        for (; index < n; index++) {
+            sum += x[index + xOffset] * y[index + yOffset];
+        }
+        return sum;
+    }
+
+    /** Strided kernel; a negative increment starts at the far end of its range and walks backwards. */
+    private static float norSdot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
+        int xIndex = incx >= 0 ? 0 : (n - 1) * -incx;
+        int yIndex = incy >= 0 ? 0 : (n - 1) * -incy;
+        float sum = 0.0f;
+        for (int count = 0; count < n; count++) {
+            sum += y[yIndex + yOffset] * x[xIndex + xOffset];
+            xIndex += incx;
+            yIndex += incy;
+        }
+        return sum;
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Snrm2.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Snrm2.java
new file mode 100644
index 0000000000000000000000000000000000000000..9fad7f6b4ba43e724356652e317fe4314ffdc56c
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Snrm2.java
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.singleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+/** Single-precision BLAS level-1 NRM2: Euclidean norm of a strided vector, accumulated with Blue's scaling. */
+public class Snrm2 {
+    private static final int MINEXPONENT = -125; // -125 is the minimum exponent in the model of the type of float.
+    private static final int MAXEXPONENT = 128; // 128 is the maximum exponent in the model of the type of float.
+    private static final int DIGITS = 24; // 24 is the number of significant binary digits of float.
+    // Blue's scaling constants: T_* are the thresholds picking an accumulator, S_* the factors applied to its inputs.
+    private static final float T_SML = (float) Math.pow(2, Math.ceil((MINEXPONENT - 1) * 0.5f));
+    private static final float T_BIG = (float) Math.pow(2, Math.floor((MAXEXPONENT - DIGITS + 1) * 0.5f));
+    private static final float S_SML = (float) Math.pow(2, -1 * Math.floor((MINEXPONENT - DIGITS) * 0.5f));
+    private static final float S_BIG = (float) Math.pow(2, -1 * Math.ceil((MAXEXPONENT + DIGITS - 1) * 0.5f));
+
+    /** Returns the 2-norm of the n elements x[xOffset + i * incx]; returns 0 when {@code n < 1 || incx < 1}. */
+    public static float snrm2(int n, float[] x, int xOffset, int incx) {
+        if (n < 1 || incx < 1) {
+            return 0.0f;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+        return norSnrm2(n, x, xOffset, incx);
+    }
+
+    /** Sums squares into small, medium and big accumulators, then merges them into one scaled square root. */
+    private static float norSnrm2(int n, float[] x, int xOffset, int incx) {
+        boolean notBig = true;
+        float aSml = 0.0f;
+        float aMed = 0.0f;
+        float aBig = 0.0f;
+        for (int count = 0, xIndex = xOffset; count < n; count++, xIndex += incx) {
+            float ax = Math.abs(x[xIndex]);
+            if (ax > T_BIG) {
+                aBig += (ax * S_BIG) * (ax * S_BIG);
+                notBig = false;
+            } else if (ax < T_SML) {
+                if (notBig) {
+                    aSml += (ax * S_SML) * (ax * S_SML);
+                }
+            } else {
+                aMed += ax * ax; // a NaN element lands here: both comparisons above are false for NaN
+            }
+        }
+        // Float.compare(v, v) is 0 even for NaN, so NaN is detected with Float.isNaN; without this a NaN held in
+        // aMed would be dropped whenever aBig or aSml is positive. (+Infinity already satisfies aMed > 0.)
+        boolean medCounts = aMed > 0.0f || Float.isNaN(aMed);
+        float scaleVal;
+        float sumSq;
+        if (aBig > 0.0f) {
+            if (medCounts) {
+                aBig += (aMed * S_BIG) * S_BIG;
+            }
+            scaleVal = 1.0f / S_BIG;
+            sumSq = aBig;
+        } else if (aSml > 0.0f) {
+            if (medCounts) {
+                aMed = (float) Math.sqrt(aMed);
+                aSml = (float) Math.sqrt(aSml) / S_SML;
+                float yMin = aSml > aMed ? aMed : aSml;
+                float yMax = aSml > aMed ? aSml : aMed;
+                scaleVal = 1.0f;
+                float yMinDivideMax = yMin / yMax;
+                sumSq = yMax * yMax * (1.0f + yMinDivideMax * yMinDivideMax);
+            } else {
+                scaleVal = 1.0f / S_SML;
+                sumSq = aSml;
+            }
+        } else {
+            scaleVal = 1.0f;
+            sumSq = aMed;
+        }
+        return scaleVal * (float) Math.sqrt(sumSq);
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Srot.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Srot.java
new file mode 100644
index 0000000000000000000000000000000000000000..240af85925d0537580a5fd1cb472e17144a3fdc2
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Srot.java
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.singleprecision;
+
+import static com.huawei.vectorblas.utils.ArrayUtil.loopBound;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.VectorSpecies;
+
+/** Single-precision BLAS level-1 ROT: applies the plane rotation (c, s) to the point pairs (x[i], y[i]). */
+public class Srot {
+    private static final VectorSpecies<Float> SSPECIES = FloatVector.SPECIES_MAX;
+
+    /** Replaces x with c * x + s * y and y with c * y - s * x over n elements; does nothing when {@code n < 1}. */
+    public static void srot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, float c,
+            float s) {
+        if (n < 1) {
+            return;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+        BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length);
+        if (incx == 1 && incy == 1) {
+            vecSrot(n, x, xOffset, y, yOffset, c, s);
+        } else {
+            norSrot(n, x, xOffset, incx, y, yOffset, incy, c, s);
+        }
+    }
+
+    /** Scalar rotation for general increments; a negative increment starts at the far end of its range. */
+    private static void norSrot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy,
+            float c, float s) {
+        int xIndex = incx < 0 ? (-n + 1) * incx : 0;
+        int yIndex = incy < 0 ? (-n + 1) * incy : 0;
+        for (int num = n; num > 0; --num, xIndex += incx, yIndex += incy) {
+            float tmp = x[xIndex + xOffset];
+            x[xIndex + xOffset] = c * tmp + s * y[yIndex + yOffset];
+            y[yIndex + yOffset] = -s * tmp + c * y[yIndex + yOffset];
+        }
+    }
+
+    /** Unit-stride rotation: four vectors per step, then one vector per step, then a scalar tail. */
+    private static void vecSrot(int n, float[] x, int xOffset, float[] y, int yOffset, float c, float s) {
+        FloatVector cv = FloatVector.broadcast(SSPECIES, c);
+        FloatVector sv = FloatVector.broadcast(SSPECIES, s);
+        FloatVector nsv = FloatVector.broadcast(SSPECIES, -s);
+        int index = 0;
+        for (int bound = loopBound(n, SSPECIES.length() * 4); index < bound; index += SSPECIES.length() * 4) {
+            // Load every operand first: the stores below must be computed from the original x and y values.
+            FloatVector xv0 = FloatVector.fromArray(SSPECIES, x, index + xOffset);
+            FloatVector xv1 = FloatVector.fromArray(SSPECIES, x, index + SSPECIES.length() + xOffset);
+            FloatVector xv2 = FloatVector.fromArray(SSPECIES, x, index + SSPECIES.length() * 2 + xOffset);
+            FloatVector xv3 = FloatVector.fromArray(SSPECIES, x, index + SSPECIES.length() * 3 + xOffset);
+            FloatVector yv0 = FloatVector.fromArray(SSPECIES, y, index + yOffset);
+            FloatVector yv1 = FloatVector.fromArray(SSPECIES, y, index + SSPECIES.length() + yOffset);
+            FloatVector yv2 = FloatVector.fromArray(SSPECIES, y, index + SSPECIES.length() * 2 + yOffset);
+            FloatVector yv3 = FloatVector.fromArray(SSPECIES, y, index + SSPECIES.length() * 3 + yOffset);
+            // New x = x * c + y * s, new y = x * (-s) + y * c, both from the vectors loaded above.
+            xv0.fma(cv, yv0.mul(sv)).intoArray(x, index + xOffset);
+            xv1.fma(cv, yv1.mul(sv)).intoArray(x, index + SSPECIES.length() + xOffset);
+            xv2.fma(cv, yv2.mul(sv)).intoArray(x, index + SSPECIES.length() * 2 + xOffset);
+            xv3.fma(cv, yv3.mul(sv)).intoArray(x, index + SSPECIES.length() * 3 + xOffset);
+            xv0.fma(nsv, yv0.mul(cv)).intoArray(y, index + yOffset);
+            xv1.fma(nsv, yv1.mul(cv)).intoArray(y, index + SSPECIES.length() + yOffset);
+            xv2.fma(nsv, yv2.mul(cv)).intoArray(y, index + SSPECIES.length() * 2 + yOffset);
+            xv3.fma(nsv, yv3.mul(cv)).intoArray(y, index + SSPECIES.length() * 3 + yOffset);
+        }
+        for (int bound = SSPECIES.loopBound(n); index < bound; index += SSPECIES.length()) {
+            FloatVector xv = FloatVector.fromArray(SSPECIES, x, index + xOffset);
+            FloatVector yv = FloatVector.fromArray(SSPECIES, y, index + yOffset);
+            xv.fma(cv, yv.mul(sv)).intoArray(x, index + xOffset);
+            xv.fma(nsv, yv.mul(cv)).intoArray(y, index + yOffset);
+        }
+        for (; index < n; index++) {
+            float tmp = x[index + xOffset];
+            x[index + xOffset] = c * tmp + s * y[index + yOffset];
+            y[index + yOffset] = c * y[index + yOffset] - s * tmp;
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Srotm.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Srotm.java
new file mode 100644
index 0000000000000000000000000000000000000000..2005a3fd81f8b5befa5a666cb82ee112ee525fee
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Srotm.java
@@ -0,0 +1,115 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.singleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.VectorSpecies;
+
+/**
+ * Single-precision BLAS level-1 ROTM: applies a modified Givens transformation H to the pairs (x[i], y[i]).
+ *
+ * <p>H is read from param starting at paramOffset: slot 0 is a flag, slots 1 to 4 hold h11, h21, h12, h22.
+ */
+public class Srotm {
+    private static final VectorSpecies<Float> SSPECIES = FloatVector.SPECIES_MAX;
+
+    /**
+     * Overwrites x with h11 * x + h12 * y and y with h21 * x + h22 * y over n elements.
+     *
+     * <p>Flag handling, as implemented here:
+     * <ul>
+     * <li>-2: x and y are left untouched;</li>
+     * <li>-1: all four entries of H come from param;</li>
+     * <li>zero: h11 = h22 = 1 and the off-diagonal entries come from param;</li>
+     * <li>any other value: h12 = 1, h21 = -1 and the diagonal entries come from param.</li>
+     * </ul>
+     * Nothing happens when {@code n < 1}.
+     */
+    public static void srotm(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, float[] param,
+            int paramOffset) {
+        if (n < 1) {
+            return;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+        BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length);
+        BlasUtils.checkBlasArray("param", paramOffset, 4, param.length);
+        float flag = param[paramOffset];
+        if (Float.compare(flag, -2.0f) == 0) { // If flag equals -2.0, do nothing and return directly.
+            return;
+        }
+        float h11 = param[paramOffset + 1];
+        float h12 = 1.0f;
+        float h21 = -1.0f;
+        float h22 = param[paramOffset + 4];
+        if (Float.compare(flag, -1.0f) == 0) {
+            h12 = param[paramOffset + 3];
+            h21 = param[paramOffset + 2];
+        } else if (BlasUtils.isZero(flag)) {
+            h11 = 1.0f;
+            h12 = param[paramOffset + 3];
+            h21 = param[paramOffset + 2];
+            h22 = 1.0f;
+        }
+        if (incx == 1 && incy == 1) {
+            vecSrotm(n, x, xOffset, y, yOffset, h11, h12, h21, h22);
+        } else {
+            norSrotm(n, x, xOffset, incx, y, yOffset, incy, h11, h12, h21, h22);
+        }
+    }
+
+    /** Scalar kernel for general increments; a negative increment starts at the far end of its range. */
+    private static void norSrotm(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy,
+            float h11, float h12, float h21, float h22) {
+        int xIndex = incx < 0 ? (-n + 1) * incx : 0;
+        int yIndex = incy < 0 ? (-n + 1) * incy : 0;
+        for (int num = n; num > 0; --num, xIndex += incx, yIndex += incy) {
+            float xTmp = x[xIndex + xOffset];
+            x[xIndex + xOffset] = h11 * xTmp + h12 * y[yIndex + yOffset];
+            y[yIndex + yOffset] = h21 * xTmp + h22 * y[yIndex + yOffset];
+        }
+    }
+
+    /** Unit-stride kernel: whole vectors through the Vector API, then a scalar loop for the remaining tail. */
+    private static void vecSrotm(int n, float[] x, int xOffset, float[] y, int yOffset, float h11, float h12,
+            float h21, float h22) {
+        FloatVector h11v = FloatVector.broadcast(SSPECIES, h11);
+        FloatVector h12v = FloatVector.broadcast(SSPECIES, h12);
+        FloatVector h21v = FloatVector.broadcast(SSPECIES, h21);
+        FloatVector h22v = FloatVector.broadcast(SSPECIES, h22);
+        int index = 0;
+        int idxLoopBound = SSPECIES.loopBound(n);
+        for (; index < idxLoopBound; index += SSPECIES.length()) {
+            FloatVector xv = FloatVector.fromArray(SSPECIES, x, index + xOffset);
+            FloatVector yv = FloatVector.fromArray(SSPECIES, y, index + yOffset);
+            (xv.mul(h11v)).add(yv.mul(h12v)).intoArray(x, index + xOffset);
+            (xv.mul(h21v)).add(yv.mul(h22v)).intoArray(y, index + yOffset);
+        }
+        for (; index < n; index++) {
+            float xTmp = x[index + xOffset];
+            x[index + xOffset] = h11 * xTmp + h12 * y[index + yOffset];
+            y[index + yOffset] = h21 * xTmp + h22 * y[index + yOffset];
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sscal.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sscal.java
new file mode 100644
index 0000000000000000000000000000000000000000..b4571fc64bb8d6e273dcd0897ac00801fa9e871b
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sscal.java
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.singleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.VectorSpecies;
+
+/** Single-precision BLAS level-1 SCAL: multiplies a strided vector by a scalar in place. */
+public class Sscal {
+    private static final VectorSpecies<Float> SSPECIES = FloatVector.SPECIES_MAX;
+
+    /** Scales n elements of x by alpha; returns without touching x when n < 1, incx < 1 or alpha equals 1. */
+    public static void sscal(int n, float alpha, float[] x, int xOffset, int incx) {
+        if (n < 1 || incx < 1 || Float.compare(alpha, 1.0f) == 0) {
+            return;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, incx * (n - 1), x.length); // incx >= 1 here, no abs needed
+        if (incx == 1) {
+            vecSscal(n, alpha, x, xOffset);
+        } else {
+            norSscal(n, alpha, x, xOffset, incx);
+        }
+    }
+
+    /** Unit-stride kernel: whole vectors through the Vector API, then a scalar loop for the remaining tail. */
+    private static void vecSscal(int n, float alpha, float[] x, int xOffset) {
+        FloatVector alphaVec = FloatVector.broadcast(SSPECIES, alpha);
+        int index = 0;
+        for (int bound = SSPECIES.loopBound(n); index < bound; index += SSPECIES.length()) {
+            FloatVector.fromArray(SSPECIES, x, index + xOffset).mul(alphaVec).intoArray(x, index + xOffset);
+        }
+        for (; index < n; index += 1) {
+            x[index + xOffset] *= alpha;
+        }
+    }
+
+    /** Scalar kernel for positive strides other than 1. */
+    private static void norSscal(int n, float alpha, float[] x, int xOffset, int incx) {
+        for (int num = n, xIndex = xOffset; num > 0; --num, xIndex += incx) {
+            x[xIndex] = alpha * x[xIndex];
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sswap.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sswap.java
new file mode 100644
index 0000000000000000000000000000000000000000..de9fd92cc7e390e75b393bca8cb6d9044904277b
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sswap.java
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas1.singleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.VectorSpecies;
+
+public class Sswap {
+    private static final VectorSpecies<Float> SSPECIES = FloatVector.SPECIES_MAX;
+
+    /** Exchanges n elements between x and y, honouring both increments; does nothing when {@code n < 1}. */
+    public static void sswap(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
+        if (n < 1) {
+            return;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+        BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length);
+        if (incx == 1 && incy == 1) {
+            vecSswap(n, x, xOffset, y, yOffset);
+        } else {
+            norSswap(n, x, xOffset, incx, y, yOffset, incy);
+        }
+    }
+
+    /** Unit-stride kernel: swaps whole vectors (both loaded before either store), then a scalar tail. */
+    private static void vecSswap(int n, float[] x, int xOffset, float[] y, int yOffset) {
+        int index = 0;
+        for (int bound = SSPECIES.loopBound(n); index < bound; index += SSPECIES.length()) {
+            FloatVector xv = FloatVector.fromArray(SSPECIES, x, index + xOffset);
+            FloatVector yv = FloatVector.fromArray(SSPECIES, y, index + yOffset);
+            xv.intoArray(y, index + yOffset);
+            yv.intoArray(x, index + xOffset);
+        }
+        for (; index < n; index++) {
+            float tmp = x[index + xOffset];
+            x[index + xOffset] = y[index + yOffset];
+            y[index + yOffset] = tmp;
+        }
+    }
+
+    /** Scalar kernel for general increments, 0-based; a negative increment starts at the far end of its range. */
+    private static void norSswap(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
+        int xIndex = incx < 0 ? (-n + 1) * incx : 0;
+        int yIndex = incy < 0 ? (-n + 1) * incy : 0;
+        for (int num = n; num > 0; --num, xIndex += incx, yIndex += incy) {
+            float tmp = x[xIndex + xOffset];
+            x[xIndex + xOffset] = y[yIndex + yOffset];
+            y[yIndex + yOffset] = tmp;
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/DblasLevel2.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/DblasLevel2.java
new file mode 100644
index 0000000000000000000000000000000000000000..955e0bf19a38173096ec9f8ea47e7d47df555867
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/DblasLevel2.java
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas2.doubleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorSpecies;
+
+/** Scaling helper for the double-precision BLAS level-2 package. */
+public class DblasLevel2 {
+    private static final VectorSpecies<Double> DSPECIES = DoubleVector.SPECIES_MAX;
+
+    /** Scales size elements of dy in place by beta (stride incy, base yOffset); a zero beta stores exact zeros. */
+    protected static void dMulBeta(int size, double beta, double[] dy, int yOffset, int incy) {
+        int yIndex = incy >= 0 ? 0 : (1 - size) * incy;
+        if (BlasUtils.isZero(beta)) {
+            // Assign rather than multiply, for every stride: 0 * NaN and 0 * Infinity are NaN, not 0.
+            for (int i = 0; i < size; i++, yIndex += incy) {
+                dy[yIndex + yOffset] = 0.0d;
+            }
+        } else if (incy == 1) {
+            DoubleVector betaVec = DoubleVector.broadcast(DSPECIES, beta);
+            int idx = 0;
+            for (int bound = DSPECIES.loopBound(size); idx < bound; idx += DSPECIES.length()) {
+                DoubleVector yv = DoubleVector.fromArray(DSPECIES, dy, idx + yOffset);
+                betaVec.mul(yv).intoArray(dy, idx + yOffset);
+            }
+            for (; idx < size; idx++) {
+                dy[idx + yOffset] = beta * dy[idx + yOffset];
+            }
+        } else {
+            for (int i = 0; i < size; i++, yIndex += incy) {
+                dy[yIndex + yOffset] = beta * dy[yIndex + yOffset];
+            }
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dgemv.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dgemv.java
new file mode 100644
index 0000000000000000000000000000000000000000..368105cdf74997898943f3860de33cd9f9c16dee
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dgemv.java
@@ -0,0 +1,378 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas2.doubleprecision;
+
+import static com.huawei.vectorblas.utils.ArrayUtil.loopBound;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+import com.huawei.vectorblas.utils.Lsame;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
+public class Dgemv {
+ private static final VectorSpecies DSPECIES = DoubleVector.SPECIES_MAX;
+
+ /**
+  * General matrix-vector product: {@code y := alpha * A * x + beta * y} when {@code trans} is
+  * "N", or {@code y := alpha * A^T * x + beta * y} when {@code trans} is "T" or "C".
+  *
+  * <p>{@code A} is an {@code m x n} matrix whose element (i, j) is read from
+  * {@code a[aOffset + i + j * lda]}. Arguments are validated through
+  * {@code BlasUtils.checkParameter} (with the 1-based BLAS argument position) and
+  * {@code BlasUtils.checkBlasArray}. The call returns without touching {@code y} when
+  * {@code m == 0}, {@code n == 0}, or {@code alpha} is zero while {@code beta} equals one.
+  *
+  * @param trans "N" for {@code A}, "T" or "C" for the transpose (identical for real data)
+  * @param m number of rows of {@code A}; must be non-negative
+  * @param n number of columns of {@code A}; must be non-negative
+  * @param alpha scale factor applied to the matrix-vector product
+  * @param a array holding {@code A}
+  * @param aOffset position of element (0, 0) inside {@code a}
+  * @param lda distance in {@code a} between consecutive columns; at least {@code max(1, m)}
+  * @param x input vector with {@code n} elements for "N", otherwise {@code m} elements
+  * @param xOffset index added to every element position inside {@code x}
+  * @param incx stride of {@code x}; must not be zero
+  * @param beta scale factor applied to {@code y} before the product is accumulated
+  * @param y vector updated in place, with {@code m} elements for "N", otherwise {@code n}
+  * @param yOffset index added to every element position inside {@code y}
+  * @param incy stride of {@code y}; must not be zero
+  */
+ public static void dgemv(String trans, int m, int n, double alpha, double[] a, int aOffset, int lda, double[] x,
+     int xOffset, int incx, double beta, double[] y, int yOffset, int incy) {
+     boolean noTrans = Lsame.lsame(trans, "N");
+     // Reference DGEMV also accepts "C"; for real matrices it means the same as "T".
+     boolean transposed = Lsame.lsame(trans, "T") || Lsame.lsame(trans, "C");
+     BlasUtils.checkParameter("DGEMV", 1, noTrans || transposed);
+     BlasUtils.checkParameter("DGEMV", 2, m >= 0);
+     BlasUtils.checkParameter("DGEMV", 3, n >= 0);
+     BlasUtils.checkParameter("DGEMV", 6, lda >= Math.max(1, m));
+     BlasUtils.checkParameter("DGEMV", 8, incx != 0);
+     BlasUtils.checkParameter("DGEMV", 11, incy != 0);
+     if (m == 0 || n == 0 || (BlasUtils.isZero(alpha) && Double.compare(beta, 1.0) == 0)) {
+         return;
+     }
+     int xLength = noTrans ? n : m;
+     int yLength = noTrans ? m : n;
+     BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (xLength - 1), x.length);
+     BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (yLength - 1), y.length);
+     BlasUtils.checkBlasArray("a", aOffset, (n - 1) * lda + m - 1, a.length);
+
+     // First form y := beta * y, then accumulate alpha * op(A) * x on top of it.
+     if (Double.compare(beta, 1.0) != 0) {
+         DblasLevel2.dMulBeta(yLength, beta, y, yOffset, incy);
+     }
+     if (BlasUtils.isZero(alpha)) {
+         return;
+     }
+     if (noTrans) {
+         // The "N" kernels vectorize along y, so they need a contiguous y.
+         if (incy == 1) {
+             if (incx == 1) {
+                 vecDgemvN(m, n, alpha, a, aOffset, lda, x, xOffset, y, yOffset);
+             } else {
+                 vecDgemvN(m, n, alpha, a, aOffset, lda, x, xOffset, incx, y, yOffset);
+             }
+         } else {
+             norDgemvN(m, n, alpha, a, aOffset, lda, x, xOffset, incx, y, yOffset, incy);
+         }
+     } else {
+         // The transposed kernels vectorize along x, so they need a contiguous x.
+         if (incx == 1) {
+             if (incy == 1) {
+                 vecDgemvT(m, n, alpha, a, aOffset, lda, x, xOffset, y, yOffset);
+             } else {
+                 vecDgemvT(m, n, alpha, a, aOffset, lda, x, xOffset, y, yOffset, incy);
+             }
+         } else {
+             norDgemvT(m, n, alpha, a, aOffset, lda, x, xOffset, incx, y, yOffset, incy);
+         }
+     }
+ }
+
+ /**
+ * Vectorized kernel for y += alpha * A * x when both x and y are contiguous (unit stride).
+ *
+ * Element (row, col) of A is read from a[row + col * lda + aOffset]. Columns are consumed four
+ * at a time; within a column group the rows are processed in three stages: four vectors per
+ * iteration, then one vector per iteration, then one scalar per iteration. Columns left over
+ * after the groups of four are handled one by one in the second outer loop.
+ *
+ * NOTE(review): loopBound(value, step) comes from ArrayUtil and is presumably the largest
+ * multiple of step that does not exceed value - confirm against that helper.
+ */
+ private static void vecDgemvN(int m, int n, double alpha, double[] a, int aOffset, int lda,
+ double[] x, int xOffset, double[] y, int yOffset) {
+ int col = 0;
+ int colLoopBound = loopBound(n, 4);
+ int rowUnrollLoopBound = loopBound(m, DSPECIES.length() * 4);
+ int rowLoopBound = loopBound(m, DSPECIES.length());
+ // Four columns per iteration. Unlike the single-column loop below, zero entries of x are not skipped here.
+ for (; col < colLoopBound; col += 4) {
+ // Broadcast alpha * x[col + k] so each column's contribution is a single fused multiply-add.
+ DoubleVector xv0 = DoubleVector.broadcast(DSPECIES, alpha * x[col + xOffset]);
+ DoubleVector xv1 = DoubleVector.broadcast(DSPECIES, alpha * x[col + 1 + xOffset]);
+ DoubleVector xv2 = DoubleVector.broadcast(DSPECIES, alpha * x[col + 2 + xOffset]);
+ DoubleVector xv3 = DoubleVector.broadcast(DSPECIES, alpha * x[col + 3 + xOffset]);
+ int row = 0;
+ // Stage 1: four vectors of y (yv0..yv3) per iteration; avRC is row chunk R of column col + C.
+ for (; row < rowUnrollLoopBound; row += DSPECIES.length() * 4) {
+ DoubleVector yv0 = DoubleVector.fromArray(DSPECIES, y, row + yOffset);
+ DoubleVector yv1 = DoubleVector.fromArray(DSPECIES, y, row + DSPECIES.length() + yOffset);
+ DoubleVector yv2 = DoubleVector.fromArray(DSPECIES, y, row + DSPECIES.length() * 2 + yOffset);
+ DoubleVector yv3 = DoubleVector.fromArray(DSPECIES, y, row + DSPECIES.length() * 3 + yOffset);
+
+ DoubleVector av00 = DoubleVector.fromArray(DSPECIES, a, row + col * lda + aOffset);
+ DoubleVector av10 = DoubleVector.fromArray(
+ DSPECIES, a, row + DSPECIES.length() + col * lda + aOffset);
+ DoubleVector av20 = DoubleVector.fromArray(
+ DSPECIES, a, (row + DSPECIES.length() * 2) + col * lda + aOffset);
+ DoubleVector av30 = DoubleVector.fromArray(
+ DSPECIES, a, (row + DSPECIES.length() * 3) + col * lda + aOffset);
+
+ DoubleVector av01 = DoubleVector.fromArray(DSPECIES, a, row + (col + 1) * lda + aOffset);
+ DoubleVector av11 = DoubleVector.fromArray(
+ DSPECIES, a, row + DSPECIES.length() + (col + 1) * lda + aOffset);
+ DoubleVector av21 = DoubleVector.fromArray(
+ DSPECIES, a, (row + DSPECIES.length() * 2) + (col + 1) * lda + aOffset);
+ DoubleVector av31 = DoubleVector.fromArray(
+ DSPECIES, a, (row + DSPECIES.length() * 3) + (col + 1) * lda + aOffset);
+
+ DoubleVector av02 = DoubleVector.fromArray(DSPECIES, a, row + (col + 2) * lda + aOffset);
+ DoubleVector av12 = DoubleVector.fromArray(
+ DSPECIES, a, row + DSPECIES.length() + (col + 2) * lda + aOffset);
+ DoubleVector av22 = DoubleVector.fromArray(
+ DSPECIES, a, (row + DSPECIES.length() * 2) + (col + 2) * lda + aOffset);
+ DoubleVector av32 = DoubleVector.fromArray(
+ DSPECIES, a, (row + DSPECIES.length() * 3) + (col + 2) * lda + aOffset);
+
+ DoubleVector av03 = DoubleVector.fromArray(DSPECIES, a, row + (col + 3) * lda + aOffset);
+ DoubleVector av13 = DoubleVector.fromArray(
+ DSPECIES, a, row + DSPECIES.length() + (col + 3) * lda + aOffset);
+ DoubleVector av23 = DoubleVector.fromArray(
+ DSPECIES, a, (row + DSPECIES.length() * 2) + (col + 3) * lda + aOffset);
+ DoubleVector av33 = DoubleVector.fromArray(
+ DSPECIES, a, (row + DSPECIES.length() * 3) + (col + 3) * lda + aOffset);
+
+ // Nested fma: column col + 3 is added to y first, column col last.
+ av00.fma(xv0, av01.fma(xv1, av02.fma(xv2, av03.fma(xv3, yv0)))).intoArray(y, row + yOffset);
+ av10.fma(xv0, av11.fma(xv1, av12.fma(xv2, av13.fma(xv3, yv1))))
+ .intoArray(y, row + DSPECIES.length() + yOffset);
+ av20.fma(xv0, av21.fma(xv1, av22.fma(xv2, av23.fma(xv3, yv2))))
+ .intoArray(y, row + DSPECIES.length() * 2 + yOffset);
+ av30.fma(xv0, av31.fma(xv1, av32.fma(xv2, av33.fma(xv3, yv3))))
+ .intoArray(y, row + DSPECIES.length() * 3 + yOffset);
+ }
+ // Stage 2: remaining full vectors, one vector of y per iteration.
+ for (; row < rowLoopBound; row += DSPECIES.length()) {
+ DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, row + yOffset);
+
+ DoubleVector av0 = DoubleVector.fromArray(DSPECIES, a, row + col * lda + aOffset);
+ DoubleVector av1 = DoubleVector.fromArray(DSPECIES, a, row + (col + 1) * lda + aOffset);
+ DoubleVector av2 = DoubleVector.fromArray(DSPECIES, a, row + (col + 2) * lda + aOffset);
+ DoubleVector av3 = DoubleVector.fromArray(DSPECIES, a, row + (col + 3) * lda + aOffset);
+
+ av0.fma(xv0, av1.fma(xv1, av2.fma(xv2, av3.fma(xv3, yv)))).intoArray(y, row + yOffset);
+ }
+ // Stage 3: scalar tail for the rows that do not fill a whole vector.
+ double x0 = alpha * x[col + xOffset];
+ double x1 = alpha * x[col + 1 + xOffset];
+ double x2 = alpha * x[col + 2 + xOffset];
+ double x3 = alpha * x[col + 3 + xOffset];
+ for (; row < m; row++) {
+ y[row + yOffset] += x0 * a[row + col * lda + aOffset]
+ + x1 * a[row + (col + 1) * lda + aOffset]
+ + x2 * a[row + (col + 2) * lda + aOffset]
+ + x3 * a[row + (col + 3) * lda + aOffset];
+ }
+ }
+ // Leftover columns (fewer than four), one at a time, with the same three row stages.
+ for (; col < n; col++) {
+ // A zero x entry contributes nothing, so the whole column is skipped.
+ if (!BlasUtils.isZero(x[col + xOffset])) {
+ DoubleVector bv = DoubleVector.broadcast(DSPECIES, alpha * x[col + xOffset]);
+ int row = 0;
+ for (; row < rowUnrollLoopBound; row += DSPECIES.length() * 4) {
+ DoubleVector yv0 = DoubleVector.fromArray(DSPECIES, y, row + yOffset);
+ DoubleVector yv1 = DoubleVector.fromArray(DSPECIES, y, row + DSPECIES.length() + yOffset);
+ DoubleVector yv2 = DoubleVector.fromArray(DSPECIES, y, row + DSPECIES.length() * 2 + yOffset);
+ DoubleVector yv3 = DoubleVector.fromArray(DSPECIES, y, row + DSPECIES.length() * 3 + yOffset);
+
+ DoubleVector av0 = DoubleVector.fromArray(DSPECIES, a, row + col * lda + aOffset);
+ DoubleVector av1 = DoubleVector.fromArray(
+ DSPECIES, a, row + DSPECIES.length() + col * lda + aOffset);
+ DoubleVector av2 = DoubleVector.fromArray(
+ DSPECIES, a, (row + DSPECIES.length() * 2) + col * lda + aOffset);
+ DoubleVector av3 = DoubleVector.fromArray(
+ DSPECIES, a, (row + DSPECIES.length() * 3) + col * lda + aOffset);
+
+ av0.fma(bv, yv0).intoArray(y, row + yOffset);
+ av1.fma(bv, yv1).intoArray(y, row + DSPECIES.length() + yOffset);
+ av2.fma(bv, yv2).intoArray(y, row + DSPECIES.length() * 2 + yOffset);
+ av3.fma(bv, yv3).intoArray(y, row + DSPECIES.length() * 3 + yOffset);
+ }
+ for (; row < rowLoopBound; row += DSPECIES.length()) {
+ DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, row + yOffset);
+ DoubleVector av = DoubleVector.fromArray(DSPECIES, a, row + col * lda + aOffset);
+ av.fma(bv, yv).intoArray(y, row + yOffset);
+ }
+ double alphaX = alpha * x[col + xOffset];
+ for (; row < m; row++) {
+ y[row + yOffset] += alphaX * a[row + col * lda + aOffset];
+ }
+ }
+ }
+ }
+
+ private static void vecDgemvN(int m, int n, double alpha, double[] a, int aOffset, int lda,
+ double[] x, int xOffset, int incx, double[] y, int yOffset) {
+ int xIndex = incx > 0 ? 0 : (n - 1) * (-incx);
+ for (int col = 0; col < n; col++, xIndex += incx) {
+ if (!BlasUtils.isZero(x[xIndex + xOffset])) {
+ double alphaMulX = alpha * x[xIndex + xOffset];
+ DoubleVector alphaMulXv = DoubleVector.broadcast(DSPECIES, alphaMulX);
+ int row = 0;
+ int rowLoopBound = DSPECIES.loopBound(m);
+ for (; row < rowLoopBound; row += DSPECIES.length()) {
+ DoubleVector av = DoubleVector.fromArray(DSPECIES, a, row + col * lda + aOffset);
+ DoubleVector cv = DoubleVector.fromArray(DSPECIES, y, row + yOffset);
+ av.fma(alphaMulXv, cv).intoArray(y, row + yOffset);
+ }
+ for (; row < m; row++) {
+ y[row + yOffset] += alphaMulX * a[row + col * lda + aOffset];
+ }
+ }
+ }
+ }
+
+ private static void norDgemvN(int m, int n, double alpha, double[] a, int aOffset, int lda,
+ double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) {
+ int xIndex = incx > 0 ? 0 : (n - 1) * (-incx);
+ for (int col = 0; col < n; col++, xIndex += incx) {
+ if (!BlasUtils.isZero(x[xIndex + xOffset])) {
+ double alphaMulX = alpha * x[xIndex + xOffset];
+ int yIndex = incy > 0 ? 0 : (m - 1) * (-incy);
+ for (int row = 0; row < m; row++, yIndex += incy) {
+ y[yIndex + yOffset] += alphaMulX * a[row + col * lda + aOffset];
+ }
+ }
+ }
+ }
+
+ private static void vecDgemvT(int m, int n, double alpha, double[] a, int aOffset, int lda, double[] x,
+ int xOffset, double[] y, int yOffset, int incy) {
+ int yIndex = incy > 0 ? 0 : (n - 1) * (-incy);
+ for (int row = 0; row < n; row++, yIndex += incy) {
+ DoubleVector cv = DoubleVector.zero(DSPECIES);
+ int col = 0;
+ int colLoopBound = DSPECIES.loopBound(m);
+ for (; col < colLoopBound; col += DSPECIES.length()) {
+ DoubleVector av = DoubleVector.fromArray(DSPECIES, a, col + row * lda + aOffset);
+ DoubleVector bv = DoubleVector.fromArray(DSPECIES, x, col + xOffset);
+ cv = av.fma(bv, cv);
+ }
+ double accum = cv.reduceLanes(VectorOperators.ADD);
+ for (; col < m; col++) {
+ accum += a[col + row * lda + aOffset] * x[col + xOffset];
+ }
+ y[yIndex + yOffset] += alpha * accum;
+ }
+ }
+
+ /**
+ * Vectorized kernel for y += alpha * A^T * x when both x and y are contiguous (unit stride).
+ *
+ * Every y element receives alpha times the dot product of x with one lda-strided column of a.
+ * Here "row" indexes those columns (and the elements of y), while "col" runs over the m entries
+ * inside a column. Four dot products are built per outer iteration; the inner loop has three
+ * stages: four vectors per iteration, one vector per iteration, then a scalar tail.
+ *
+ * NOTE(review): loopBound(value, step) comes from ArrayUtil and is presumably the largest
+ * multiple of step that does not exceed value - confirm against that helper.
+ */
+ private static void vecDgemvT(int m, int n, double alpha, double[] a, int aOffset, int lda,
+ double[] x, int xOffset, double[] y, int yOffset) {
+ int row = 0;
+ int rowLoopBound = loopBound(n, 4);
+ int colUnrollLoopBound = loopBound(m, DSPECIES.length() * 4);
+ int colLoopBound = loopBound(m, DSPECIES.length());
+ // Four y elements (four dot products) per iteration.
+ for (; row < rowLoopBound; row += 4) {
+ // Vector accumulators, one per dot product; reduced to scalars after the vector stages.
+ DoubleVector yv0 = DoubleVector.zero(DSPECIES);
+ DoubleVector yv1 = DoubleVector.zero(DSPECIES);
+ DoubleVector yv2 = DoubleVector.zero(DSPECIES);
+ DoubleVector yv3 = DoubleVector.zero(DSPECIES);
+ int col = 0;
+ // Stage 1: four vectors of x per iteration, shared by all four dot products.
+ for (; col < colUnrollLoopBound; col += DSPECIES.length() * 4) {
+ DoubleVector xv0 = DoubleVector.fromArray(DSPECIES, x, col + xOffset);
+ DoubleVector xv1 = DoubleVector.fromArray(DSPECIES, x, col + DSPECIES.length() + xOffset);
+ DoubleVector xv2 = DoubleVector.fromArray(DSPECIES, x, col + (DSPECIES.length() * 2) + xOffset);
+ DoubleVector xv3 = DoubleVector.fromArray(DSPECIES, x, col + (DSPECIES.length() * 3) + xOffset);
+
+ // avCR is chunk C of the column at index row + R.
+ DoubleVector av00 = DoubleVector.fromArray(DSPECIES, a, col + row * lda + aOffset);
+ DoubleVector av10 = DoubleVector.fromArray(
+ DSPECIES, a, col + DSPECIES.length() + row * lda + aOffset);
+ DoubleVector av20 = DoubleVector.fromArray(
+ DSPECIES, a, col + (DSPECIES.length() * 2) + row * lda + aOffset);
+ DoubleVector av30 = DoubleVector.fromArray(
+ DSPECIES, a, col + (DSPECIES.length() * 3) + row * lda + aOffset);
+ yv0 = av00.fma(xv0, av10.fma(xv1, av20.fma(xv2, av30.fma(xv3, yv0))));
+
+ DoubleVector av01 = DoubleVector.fromArray(DSPECIES, a, col + (row + 1) * lda + aOffset);
+ DoubleVector av11 = DoubleVector.fromArray(
+ DSPECIES, a, col + DSPECIES.length() + (row + 1) * lda + aOffset);
+ DoubleVector av21 = DoubleVector.fromArray(
+ DSPECIES, a, col + (DSPECIES.length() * 2) + (row + 1) * lda + aOffset);
+ DoubleVector av31 = DoubleVector.fromArray(
+ DSPECIES, a, col + (DSPECIES.length() * 3) + (row + 1) * lda + aOffset);
+ yv1 = av01.fma(xv0, av11.fma(xv1, av21.fma(xv2, av31.fma(xv3, yv1))));
+
+ DoubleVector av02 = DoubleVector.fromArray(DSPECIES, a, col + (row + 2) * lda + aOffset);
+ DoubleVector av12 = DoubleVector.fromArray(
+ DSPECIES, a, col + DSPECIES.length() + (row + 2) * lda + aOffset);
+ DoubleVector av22 = DoubleVector.fromArray(
+ DSPECIES, a, col + (DSPECIES.length() * 2) + (row + 2) * lda + aOffset);
+ DoubleVector av32 = DoubleVector.fromArray(
+ DSPECIES, a, col + (DSPECIES.length() * 3) + (row + 2) * lda + aOffset);
+ yv2 = av02.fma(xv0, av12.fma(xv1, av22.fma(xv2, av32.fma(xv3, yv2))));
+
+ DoubleVector av03 = DoubleVector.fromArray(DSPECIES, a, col + (row + 3) * lda + aOffset);
+ DoubleVector av13 = DoubleVector.fromArray(
+ DSPECIES, a, col + DSPECIES.length() + (row + 3) * lda + aOffset);
+ DoubleVector av23 = DoubleVector.fromArray(
+ DSPECIES, a, col + (DSPECIES.length() * 2) + (row + 3) * lda + aOffset);
+ DoubleVector av33 = DoubleVector.fromArray(
+ DSPECIES, a, col + (DSPECIES.length() * 3) + (row + 3) * lda + aOffset);
+ yv3 = av03.fma(xv0, av13.fma(xv1, av23.fma(xv2, av33.fma(xv3, yv3))));
+ }
+ // Stage 2: remaining full vectors, one vector of x per iteration.
+ for (; col < colLoopBound; col += DSPECIES.length()) {
+ DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, col + xOffset);
+
+ DoubleVector av0 = DoubleVector.fromArray(DSPECIES, a, col + row * lda + aOffset);
+ DoubleVector av1 = DoubleVector.fromArray(DSPECIES, a, col + (row + 1) * lda + aOffset);
+ DoubleVector av2 = DoubleVector.fromArray(DSPECIES, a, col + (row + 2) * lda + aOffset);
+ DoubleVector av3 = DoubleVector.fromArray(DSPECIES, a, col + (row + 3) * lda + aOffset);
+
+ yv0 = av0.fma(xv, yv0);
+ yv1 = av1.fma(xv, yv1);
+ yv2 = av2.fma(xv, yv2);
+ yv3 = av3.fma(xv, yv3);
+ }
+ // Collapse each vector accumulator into a scalar by summing its lanes.
+ double accum0 = yv0.reduceLanes(VectorOperators.ADD);
+ double accum1 = yv1.reduceLanes(VectorOperators.ADD);
+ double accum2 = yv2.reduceLanes(VectorOperators.ADD);
+ double accum3 = yv3.reduceLanes(VectorOperators.ADD);
+ // Stage 3: scalar tail for the entries that do not fill a whole vector.
+ for (; col < m; col++) {
+ accum0 += a[col + row * lda + aOffset] * x[col + xOffset];
+ accum1 += a[col + (row + 1) * lda + aOffset] * x[col + xOffset];
+ accum2 += a[col + (row + 2) * lda + aOffset] * x[col + xOffset];
+ accum3 += a[col + (row + 3) * lda + aOffset] * x[col + xOffset];
+ }
+ // alpha is applied once per dot product rather than per element.
+ y[row + yOffset] += alpha * accum0;
+ y[row + 1 + yOffset] += alpha * accum1;
+ y[row + 2 + yOffset] += alpha * accum2;
+ y[row + 3 + yOffset] += alpha * accum3;
+ }
+ // Leftover y elements (fewer than four), one dot product at a time.
+ for (; row < n; row++) {
+ DoubleVector yv = DoubleVector.zero(DSPECIES);
+ int col = 0;
+ for (; col < colUnrollLoopBound; col += DSPECIES.length() * 4) {
+ DoubleVector xv0 = DoubleVector.fromArray(DSPECIES, x, col + xOffset);
+ DoubleVector xv1 = DoubleVector.fromArray(DSPECIES, x, col + DSPECIES.length() + xOffset);
+ DoubleVector xv2 = DoubleVector.fromArray(DSPECIES, x, col + (DSPECIES.length() * 2) + xOffset);
+ DoubleVector xv3 = DoubleVector.fromArray(DSPECIES, x, col + (DSPECIES.length() * 3) + xOffset);
+
+ DoubleVector av0 = DoubleVector.fromArray(DSPECIES, a, col + row * lda + aOffset);
+ DoubleVector av1 = DoubleVector.fromArray(
+ DSPECIES, a, col + DSPECIES.length() + row * lda + aOffset);
+ DoubleVector av2 = DoubleVector.fromArray(
+ DSPECIES, a, col + (DSPECIES.length() * 2) + row * lda + aOffset);
+ DoubleVector av3 = DoubleVector.fromArray(
+ DSPECIES, a, col + (DSPECIES.length() * 3) + row * lda + aOffset);
+
+ yv = av0.fma(xv0, av1.fma(xv1, av2.fma(xv2, av3.fma(xv3, yv))));
+ }
+ for (; col < colLoopBound; col += DSPECIES.length()) {
+ DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, col + xOffset);
+ DoubleVector av = DoubleVector.fromArray(DSPECIES, a, col + row * lda + aOffset);
+ yv = av.fma(xv, yv);
+ }
+ double accum = yv.reduceLanes(VectorOperators.ADD);
+ for (; col < m; col++) {
+ accum += a[col + row * lda + aOffset] * x[col + xOffset];
+ }
+ y[row + yOffset] += alpha * accum;
+ }
+ }
+
+ private static void norDgemvT(int m, int n, double alpha, double[] a, int aOffset, int lda, double[] x,
+ int xOffset, int incx, double[] y, int yOffset, int incy) {
+ int yIndex = incy > 0 ? 0 : (n - 1) * (-incy);
+ for (int j = 0; j < n; j++, yIndex += incy) {
+ double accum = 0.0d;
+ int xIndex = incx > 0 ? 0 : (m - 1) * (-incx);
+ for (int i = 0; i < m; i++, xIndex += incx) {
+ accum += a[i + j * lda + aOffset] * x[xIndex + xOffset];
+ }
+ y[yIndex + yOffset] += alpha * accum;
+ }
+ }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dger.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dger.java
new file mode 100644
index 0000000000000000000000000000000000000000..b79555108d2bc19af08994290cea4d4273616133
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dger.java
@@ -0,0 +1,180 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas2.doubleprecision;
+
+import static com.huawei.vectorblas.utils.ArrayUtil.loopBound;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorSpecies;
+
+public class Dger {
+ private static final VectorSpecies DSPECIES = DoubleVector.SPECIES_MAX;
+ private static final int UNROLL_SIZE = 4;
+
+ /**
+  * Rank-one update {@code A := alpha * x * y^T + A} of an {@code m x n} matrix.
+  *
+  * <p>Arguments are validated through {@code BlasUtils.checkParameter} (with the 1-based BLAS
+  * argument position) and {@code BlasUtils.checkBlasArray}. The call returns without touching
+  * {@code a} when {@code m == 0}, {@code n == 0} or {@code alpha} is zero.
+  *
+  * @param m number of rows of {@code A} and length of {@code x}; must be non-negative
+  * @param n number of columns of {@code A} and length of {@code y}; must be non-negative
+  * @param alpha scale factor of the outer product
+  * @param x vector with {@code m} elements
+  * @param xOffset index added to every element position inside {@code x}
+  * @param incx stride of {@code x}; must not be zero
+  * @param y vector with {@code n} elements
+  * @param yOffset index added to every element position inside {@code y}
+  * @param incy stride of {@code y}; must not be zero
+  * @param a array holding {@code A}; updated in place
+  * @param aOffset position of element (0, 0) inside {@code a}
+  * @param lda distance in {@code a} between consecutive columns; at least {@code max(1, m)}
+  */
+ public static void dger(int m, int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset,
+     int incy, double[] a, int aOffset, int lda) {
+     BlasUtils.checkParameter("DGER", 1, m >= 0);
+     BlasUtils.checkParameter("DGER", 2, n >= 0);
+     BlasUtils.checkParameter("DGER", 5, incx != 0);
+     BlasUtils.checkParameter("DGER", 7, incy != 0);
+     BlasUtils.checkParameter("DGER", 9, lda >= Math.max(1, m));
+
+     boolean nothingToDo = m == 0 || n == 0 || BlasUtils.isZero(alpha);
+     if (nothingToDo) {
+         return;
+     }
+
+     int xSpan = Math.abs(incx) * (m - 1);
+     BlasUtils.checkBlasArray("x", xOffset, xSpan, x.length);
+     int ySpan = Math.abs(incy) * (n - 1);
+     BlasUtils.checkBlasArray("y", yOffset, ySpan, y.length);
+     int aSpan = (m - 1) + (n - 1) * lda;
+     BlasUtils.checkBlasArray("a", aOffset, aSpan, a.length);
+
+     // Only the fully contiguous case has a vectorized kernel.
+     boolean unitStrides = incx == 1 && incy == 1;
+     if (!unitStrides) {
+         normalDger(m, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset, lda);
+         return;
+     }
+     vecDger(m, n, alpha, x, xOffset, y, yOffset, a, aOffset, lda);
+ }
+
+ /**
+ * Vectorized kernel for A += alpha * x * y^T when both x and y are contiguous (unit stride).
+ *
+ * Element (row, col) of A lives at a[row + col * lda + aOffset] and receives
+ * (alpha * y[col]) * x[row]. Columns are updated four at a time, then one at a time for the
+ * leftovers. Rows are processed UNROLL_SIZE vectors per iteration; there is no single-vector
+ * stage, so all remaining rows fall through to the scalar tail.
+ *
+ * NOTE(review): loopBound(value, step) comes from ArrayUtil and is presumably the largest
+ * multiple of step that does not exceed value - confirm against that helper.
+ */
+ private static void vecDger(int m, int n, double alpha, double[] x, int xOffset, double[] y, int yOffset,
+ double[] a, int aOffset, int lda) {
+ int colLoopBound = loopBound(n, UNROLL_SIZE);
+ int rowLoopBound = loopBound(m, UNROLL_SIZE * DSPECIES.length());
+ int col = 0;
+ // Four columns per iteration; zero y entries are not skipped on this path.
+ for (; col < colLoopBound; col += UNROLL_SIZE) {
+ // Broadcast alpha * y[col + k] so each column update is a single fused multiply-add with x.
+ DoubleVector alphaMulYv0 = DoubleVector.broadcast(DSPECIES, alpha * y[col + yOffset]);
+ DoubleVector alphaMulYv1 = DoubleVector.broadcast(DSPECIES, alpha * y[col + 1 + yOffset]);
+ DoubleVector alphaMulYv2 = DoubleVector.broadcast(DSPECIES, alpha * y[col + 2 + yOffset]);
+ DoubleVector alphaMulYv3 = DoubleVector.broadcast(DSPECIES, alpha * y[col + 3 + yOffset]);
+ int row = 0;
+ for (; row < rowLoopBound; row += UNROLL_SIZE * DSPECIES.length()) {
+ // The four x vectors are loaded once and reused for all four columns.
+ DoubleVector xv0 = DoubleVector.fromArray(DSPECIES, x, row + xOffset);
+ DoubleVector xv1 = DoubleVector.fromArray(DSPECIES, x, row + DSPECIES.length() + xOffset);
+ DoubleVector xv2 = DoubleVector.fromArray(DSPECIES, x, row + 2 * DSPECIES.length() + xOffset);
+ DoubleVector xv3 = DoubleVector.fromArray(DSPECIES, x, row + 3 * DSPECIES.length() + xOffset);
+
+ // avCR is row chunk R of the column at index col + C.
+ DoubleVector av00 = DoubleVector.fromArray(DSPECIES, a, row + col * lda + aOffset);
+ DoubleVector av01 = DoubleVector.fromArray(DSPECIES, a,
+ row + DSPECIES.length() + col * lda + aOffset);
+ DoubleVector av02 = DoubleVector.fromArray(DSPECIES, a,
+ row + 2 * DSPECIES.length() + col * lda + aOffset);
+ DoubleVector av03 = DoubleVector.fromArray(DSPECIES, a,
+ row + 3 * DSPECIES.length() + col * lda + aOffset);
+
+ xv0.fma(alphaMulYv0, av00).intoArray(a, row + col * lda + aOffset);
+ xv1.fma(alphaMulYv0, av01).intoArray(a, row + DSPECIES.length() + col * lda + aOffset);
+ xv2.fma(alphaMulYv0, av02).intoArray(a, row + 2 * DSPECIES.length() + col * lda + aOffset);
+ xv3.fma(alphaMulYv0, av03).intoArray(a, row + 3 * DSPECIES.length() + col * lda + aOffset);
+
+ DoubleVector av10 = DoubleVector.fromArray(DSPECIES, a, row + (col + 1) * lda + aOffset);
+ DoubleVector av11 = DoubleVector.fromArray(DSPECIES, a,
+ row + DSPECIES.length() + (col + 1) * lda + aOffset);
+ DoubleVector av12 = DoubleVector.fromArray(DSPECIES, a,
+ row + 2 * DSPECIES.length() + (col + 1) * lda + aOffset);
+ DoubleVector av13 = DoubleVector.fromArray(DSPECIES, a,
+ row + 3 * DSPECIES.length() + (col + 1) * lda + aOffset);
+
+ xv0.fma(alphaMulYv1, av10).intoArray(a, row + (col + 1) * lda + aOffset);
+ xv1.fma(alphaMulYv1, av11).intoArray(a, row + DSPECIES.length() + (col + 1) * lda + aOffset);
+ xv2.fma(alphaMulYv1, av12).intoArray(a, row + 2 * DSPECIES.length() + (col + 1) * lda + aOffset);
+ xv3.fma(alphaMulYv1, av13).intoArray(a, row + 3 * DSPECIES.length() + (col + 1) * lda + aOffset);
+
+ DoubleVector av20 = DoubleVector.fromArray(DSPECIES, a, row + (col + 2) * lda + aOffset);
+ DoubleVector av21 = DoubleVector.fromArray(DSPECIES, a,
+ row + DSPECIES.length() + (col + 2) * lda + aOffset);
+ DoubleVector av22 = DoubleVector.fromArray(DSPECIES, a,
+ row + 2 * DSPECIES.length() + (col + 2) * lda + aOffset);
+ DoubleVector av23 = DoubleVector.fromArray(DSPECIES, a,
+ row + 3 * DSPECIES.length() + (col + 2) * lda + aOffset);
+
+ xv0.fma(alphaMulYv2, av20).intoArray(a, row + (col + 2) * lda + aOffset);
+ xv1.fma(alphaMulYv2, av21).intoArray(a, row + DSPECIES.length() + (col + 2) * lda + aOffset);
+ xv2.fma(alphaMulYv2, av22).intoArray(a, row + 2 * DSPECIES.length() + (col + 2) * lda + aOffset);
+ xv3.fma(alphaMulYv2, av23).intoArray(a, row + 3 * DSPECIES.length() + (col + 2) * lda + aOffset);
+
+ DoubleVector av30 = DoubleVector.fromArray(DSPECIES, a, row + (col + 3) * lda + aOffset);
+ DoubleVector av31 = DoubleVector.fromArray(DSPECIES, a,
+ row + DSPECIES.length() + (col + 3) * lda + aOffset);
+ DoubleVector av32 = DoubleVector.fromArray(DSPECIES, a,
+ row + 2 * DSPECIES.length() + (col + 3) * lda + aOffset);
+ DoubleVector av33 = DoubleVector.fromArray(DSPECIES, a,
+ row + 3 * DSPECIES.length() + (col + 3) * lda + aOffset);
+
+ xv0.fma(alphaMulYv3, av30).intoArray(a, row + (col + 3) * lda + aOffset);
+ xv1.fma(alphaMulYv3, av31).intoArray(a, row + DSPECIES.length() + (col + 3) * lda + aOffset);
+ xv2.fma(alphaMulYv3, av32).intoArray(a, row + 2 * DSPECIES.length() + (col + 3) * lda + aOffset);
+ xv3.fma(alphaMulYv3, av33).intoArray(a, row + 3 * DSPECIES.length() + (col + 3) * lda + aOffset);
+ }
+ // Scalar tail: every row not covered by the unrolled vector loop.
+ double alphaMulY0 = alpha * y[col + yOffset];
+ double alphaMulY1 = alpha * y[col + 1 + yOffset];
+ double alphaMulY2 = alpha * y[col + 2 + yOffset];
+ double alphaMulY3 = alpha * y[col + 3 + yOffset];
+ for (; row < m; row++) {
+ a[row + col * lda + aOffset] += alphaMulY0 * x[row + xOffset];
+ a[row + (col + 1) * lda + aOffset] += alphaMulY1 * x[row + xOffset];
+ a[row + (col + 2) * lda + aOffset] += alphaMulY2 * x[row + xOffset];
+ a[row + (col + 3) * lda + aOffset] += alphaMulY3 * x[row + xOffset];
+ }
+ }
+ // Leftover columns (fewer than UNROLL_SIZE), one at a time.
+ for (; col < n; col++) {
+ DoubleVector alphaMulYv = DoubleVector.broadcast(DSPECIES, alpha * y[col + yOffset]);
+ int row = 0;
+ for (; row < rowLoopBound; row += UNROLL_SIZE * DSPECIES.length()) {
+ DoubleVector av0 = DoubleVector.fromArray(DSPECIES, a, row + col * lda + aOffset);
+ DoubleVector av1 = DoubleVector.fromArray(DSPECIES, a, row + DSPECIES.length() + col * lda + aOffset);
+ DoubleVector av2 = DoubleVector.fromArray(DSPECIES, a,
+ row + 2 * DSPECIES.length() + col * lda + aOffset);
+ DoubleVector av3 = DoubleVector.fromArray(DSPECIES, a,
+ row + 3 * DSPECIES.length() + col * lda + aOffset);
+
+ DoubleVector xv0 = DoubleVector.fromArray(DSPECIES, x, row + xOffset);
+ DoubleVector xv1 = DoubleVector.fromArray(DSPECIES, x, row + DSPECIES.length() + xOffset);
+ DoubleVector xv2 = DoubleVector.fromArray(DSPECIES, x, row + 2 * DSPECIES.length() + xOffset);
+ DoubleVector xv3 = DoubleVector.fromArray(DSPECIES, x, row + 3 * DSPECIES.length() + xOffset);
+
+ xv0.fma(alphaMulYv, av0).intoArray(a, row + col * lda + aOffset);
+ xv1.fma(alphaMulYv, av1).intoArray(a, row + DSPECIES.length() + col * lda + aOffset);
+ xv2.fma(alphaMulYv, av2).intoArray(a, row + 2 * DSPECIES.length() + col * lda + aOffset);
+ xv3.fma(alphaMulYv, av3).intoArray(a, row + 3 * DSPECIES.length() + col * lda + aOffset);
+ }
+ double alphaMulY0 = alpha * y[col + yOffset];
+ for (; row < m; row++) {
+ a[row + col * lda + aOffset] += alphaMulY0 * x[row + xOffset];
+ }
+ }
+ }
+
+ private static void normalDger(int m, int n, double alpha, double[] x, int xOffset, int incx, double[] y,
+ int yOffset, int incy, double[] a, int aOffset, int lda) {
+ int xStartIndx = incx > 0 ? 0 : -(m - 1) * incx;
+ int yStartIndx = incy > 0 ? 0 : -(n - 1) * incy;
+
+ for (int j = 0; j < n; j++, yStartIndx += incy) {
+ if (!BlasUtils.isZero(y[yStartIndx + yOffset])) {
+ for (int i = 0, xIndx = xStartIndx; i < m; i++, xIndx += incx) {
+ a[i + j * lda + aOffset] += alpha * x[xIndx + xOffset] * y[yStartIndx + yOffset];
+ }
+ }
+ }
+ }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dspmv.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dspmv.java
new file mode 100644
index 0000000000000000000000000000000000000000..b709571cfe4552e7af45ef9b151b4ee9d6bb7540
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dspmv.java
@@ -0,0 +1,288 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas2.doubleprecision;
+
+import static com.huawei.vectorblas.utils.ArrayUtil.loopBound;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+import com.huawei.vectorblas.utils.Lsame;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
+public class Dspmv {
+ public static final VectorSpecies<Double> DSPECIES = DoubleVector.SPECIES_MAX; // widest shape available
+
+ public static void dspmv(String uplo, int n, double alpha, double[] a, int aOffset, double[] x, int xOffset,
+         int incx, double beta, double[] y, int yOffset, int incy) {
+     BlasUtils.checkParameter("DSPMV", 1, Lsame.lsame(uplo, "U") || Lsame.lsame(uplo, "L"));
+     BlasUtils.checkParameter("DSPMV", 2, n >= 0);
+     BlasUtils.checkParameter("DSPMV", 6, incx != 0);
+     BlasUtils.checkParameter("DSPMV", 9, incy != 0);
+
+     final boolean betaIsOne = Double.compare(beta, 1.0d) == 0;
+     if (n == 0 || (BlasUtils.isZero(alpha) && betaIsOne)) {
+         return; // nothing to scale and nothing to accumulate
+     }
+
+     BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+     BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length);
+     // NOTE(review): (1 + n) * n is int arithmetic and overflows once n > 46340.
+     BlasUtils.checkBlasArray("a", aOffset, (1 + n) * n / 2 - 1, a.length);
+
+     if (!betaIsOne) {
+         DblasLevel2.dMulBeta(n, beta, y, yOffset, incy); // presumably y *= beta before A * x is added
+     }
+     if (BlasUtils.isZero(alpha)) {
+         return;
+     }
+     final boolean upper = Lsame.lsame(uplo, "U");
+     final int xStartIndex = incx > 0 ? 0 : (n - 1) * (-incx);
+     final int yStartIndex = incy > 0 ? 0 : (n - 1) * (-incy);
+     if (incx == 1 && incy == 1) { // unit strides use the vectorized kernels
+         if (upper) {
+             vecDspmvU(n, alpha, a, aOffset, x, xOffset, y, yOffset);
+         } else {
+             vecDspmvL(n, alpha, a, aOffset, x, xOffset, y, yOffset);
+         }
+     } else if (upper) {
+         norDspmvU(n, alpha, a, aOffset, x, xOffset, incx, y, yOffset, incy, xStartIndex, yStartIndex);
+     } else {
+         norDspmvL(n, alpha, a, aOffset, x, xOffset, incx, y, yOffset, incy, xStartIndex, yStartIndex);
+     }
+ }
+
+ private static void vecDspmvU(int n, double alpha, double[] a, int aOffset, double[] x, int xOffset, double[] y,
+ int yOffset) { // unit-stride y += alpha * A * x; symmetric A is read from packed upper-triangular columns
+ int col = 0;
+ int colLoopBound = loopBound(n, 4); // presumably the largest multiple of 4 not exceeding n - confirm in ArrayUtil
+ for (; col < colLoopBound; col += 4) { // four columns per iteration (unroll factor 4)
+ double alphaMulX0 = alpha * x[xOffset + col];
+ double alphaMulX1 = alpha * x[xOffset + (col + 1)];
+ double alphaMulX2 = alpha * x[xOffset + (col + 2)];
+ double alphaMulX3 = alpha * x[xOffset + (col + 3)];
+ DoubleVector alphaMulXV0 = DoubleVector.broadcast(DSPECIES, alphaMulX0);
+ DoubleVector alphaMulXV1 = DoubleVector.broadcast(DSPECIES, alphaMulX1);
+ DoubleVector alphaMulXV2 = DoubleVector.broadcast(DSPECIES, alphaMulX2);
+ DoubleVector alphaMulXV3 = DoubleVector.broadcast(DSPECIES, alphaMulX3);
+ DoubleVector accumv0 = DoubleVector.zero(DSPECIES);
+ DoubleVector accumv1 = DoubleVector.zero(DSPECIES);
+ DoubleVector accumv2 = DoubleVector.zero(DSPECIES);
+ DoubleVector accumv3 = DoubleVector.zero(DSPECIES);
+ int row = 0; // entry (row, c) of the packed upper triangle is read from a[aOffset + row + c * (c + 1) / 2]
+ for (; row < col - col % DSPECIES.length(); row += DSPECIES.length()) { // whole vectors of rows above row col
+ DoubleVector av0 = DoubleVector.fromArray(DSPECIES, a, aOffset + row + col * (col + 1) / 2);
+ DoubleVector av1 = DoubleVector.fromArray(DSPECIES, a, aOffset + row + (col + 1) * ((col + 1) + 1) / 2);
+ DoubleVector av2 = DoubleVector.fromArray(DSPECIES, a, aOffset + row + (col + 2) * ((col + 2) + 1) / 2);
+ DoubleVector av3 = DoubleVector.fromArray(DSPECIES, a, aOffset + row + (col + 3) * ((col + 3) + 1) / 2);
+ DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, yOffset + row);
+ DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, xOffset + row);
+ yv = alphaMulXV0.fma(av0, yv); // y[row..] += alpha * x[col] * A[row.., col]
+ yv = alphaMulXV1.fma(av1, yv);
+ yv = alphaMulXV2.fma(av2, yv);
+ alphaMulXV3.fma(av3, yv).intoArray(y, yOffset + row);
+ accumv0 = xv.fma(av0, accumv0); // lane-wise partial sums of x[row] * A[row, col]
+ accumv1 = xv.fma(av1, accumv1);
+ accumv2 = xv.fma(av2, accumv2);
+ accumv3 = xv.fma(av3, accumv3);
+ }
+ double accum0 = alpha * accumv0.reduceLanes(VectorOperators.ADD); // alpha * (sum over the vectorized rows)
+ double accum1 = alpha * accumv1.reduceLanes(VectorOperators.ADD);
+ double accum2 = alpha * accumv2.reduceLanes(VectorOperators.ADD);
+ double accum3 = alpha * accumv3.reduceLanes(VectorOperators.ADD);
+ for (; row < col; row++) { // remaining rows above row col, one at a time
+ double a0 = a[aOffset + row + col * (col + 1) / 2];
+ double a1 = a[aOffset + row + (col + 1) * ((col + 1) + 1) / 2];
+ double a2 = a[aOffset + row + (col + 2) * ((col + 2) + 1) / 2];
+ double a3 = a[aOffset + row + (col + 3) * ((col + 3) + 1) / 2];
+ double x0 = x[row + xOffset];
+ y[row + yOffset] += alpha * (a0 * x[col + xOffset] + a1 * x[(col + 1) + xOffset]
+ + a2 * x[(col + 2) + xOffset] + a3 * x[(col + 3) + xOffset]);
+ accum0 += alpha * a0 * x0;
+ accum1 += alpha * a1 * x0;
+ accum2 += alpha * a2 * x0;
+ accum3 += alpha * a3 * x0;
+ }
+ double a00 = a[aOffset + row + col * (col + 1) / 2]; // row == col here: upper part of the 4x4 diagonal block
+ double a01 = a[aOffset + row + (col + 1) * ((col + 1) + 1) / 2];
+ double a02 = a[aOffset + row + (col + 2) * ((col + 2) + 1) / 2];
+ double a03 = a[aOffset + row + (col + 3) * ((col + 3) + 1) / 2];
+ double a11 = a[aOffset + (row + 1) + (col + 1) * ((col + 1) + 1) / 2];
+ double a12 = a[aOffset + (row + 1) + (col + 2) * ((col + 2) + 1) / 2];
+ double a13 = a[aOffset + (row + 1) + (col + 3) * ((col + 3) + 1) / 2];
+ double a22 = a[aOffset + (row + 2) + (col + 2) * ((col + 2) + 1) / 2];
+ double a23 = a[aOffset + (row + 2) + (col + 3) * ((col + 3) + 1) / 2];
+ double a33 = a[aOffset + (row + 3) + (col + 3) * ((col + 3) + 1) / 2];
+ y[yOffset + col] += alphaMulX0 * a00 + alphaMulX1 * a01 + alphaMulX2 * a02 + alphaMulX3 * a03 + accum0;
+ y[yOffset + (col + 1)] += alphaMulX0 * a01 + alphaMulX1 * a11 + alphaMulX2 * a12 + alphaMulX3 * a13
+ + accum1; // a01 is reused as the mirrored entry A[col + 1, col]
+ y[yOffset + (col + 2)] += alphaMulX0 * a02 + alphaMulX1 * a12 + alphaMulX2 * a22 + alphaMulX3 * a23
+ + accum2;
+ y[yOffset + (col + 3)] += alphaMulX0 * a03 + alphaMulX1 * a13 + alphaMulX2 * a23 + alphaMulX3 * a33
+ + accum3;
+ }
+ for (; col < n; col += 1) { // columns not covered by the 4-wide loop, one at a time
+ double alphaMulX0 = alpha * x[xOffset + col];
+ DoubleVector accumv0 = DoubleVector.zero(DSPECIES);
+ DoubleVector alphaMulXV0 = DoubleVector.broadcast(DSPECIES, alphaMulX0);
+ int row = 0;
+ for (; row < col - col % DSPECIES.length(); row += DSPECIES.length()) {
+ DoubleVector av = DoubleVector.fromArray(DSPECIES, a, aOffset + row + col * (col + 1) / 2);
+ DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, yOffset + row);
+ DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, xOffset + row);
+ av.fma(alphaMulXV0, yv).intoArray(y, yOffset + row);
+ accumv0 = av.fma(xv, accumv0);
+ }
+ double accum0 = accumv0.reduceLanes(VectorOperators.ADD); // not yet scaled; alpha is applied in the final update
+ for (; row < col; row++) {
+ double a0 = a[aOffset + row + col * (col + 1) / 2];
+ y[yOffset + row] += a0 * alphaMulX0;
+ accum0 += x[xOffset + row] * a0;
+ }
+ y[yOffset + col] += a[aOffset + row + col * (col + 1) / 2] * alphaMulX0 + alpha * accum0; // row == col: diagonal
+ }
+ }
+
+ private static void norDspmvU(int n, double alpha, double[] a, int aOffset, double[] x, int xOffset, int incx,
+ double[] y, int yOffset, int incy, int xStartIndex, int yStartIndex) {
+ int aIndx = 1;
+ for (int col = 0, xIndx = xStartIndex, yIndx = yStartIndex; col < n; col++, xIndx += incx, yIndx += incy) {
+ double alphaMulX = alpha * x[xIndx + xOffset];
+ double accum = 0.0d;
+
+ for (int row = aIndx, xi = xStartIndex, yi = yStartIndex; row < aIndx + col; row++, xi += incx,
+ yi += incy) {
+ y[yi + yOffset] += alphaMulX * a[row - 1 + aOffset];
+ accum += a[row - 1 + aOffset] * x[xi + xOffset];
+ }
+
+ y[yIndx + yOffset] = y[yIndx + yOffset] + alphaMulX * a[aIndx + col - 1 + aOffset] + alpha * accum;
+ aIndx += col + 1;
+ }
+ }
+
+ private static void vecDspmvL(int n, double alpha, double[] a, int aOffset, double[] x, int xOffset, double[] y,
+ int yOffset) { // unit-stride y += alpha * A * x; symmetric A is read from packed lower-triangular columns
+ int col = 0;
+ int colLoopBound = loopBound(n, 4); // presumably the largest multiple of 4 not exceeding n - confirm in ArrayUtil
+ for (; col < colLoopBound; col += 4) { // four columns per iteration (unroll factor 4)
+ int row = col; // entry (row, c) of the packed lower triangle is read from a[aOffset + row - c * (c + 1) / 2 + n * c]
+ double alphaMulX0 = alpha * x[xOffset + col];
+ double alphaMulX1 = alpha * x[xOffset + (col + 1)];
+ double alphaMulX2 = alpha * x[xOffset + (col + 2)];
+ double alphaMulX3 = alpha * x[xOffset + (col + 3)];
+ DoubleVector alphaMulXV0 = DoubleVector.broadcast(DSPECIES, alphaMulX0);
+ DoubleVector alphaMulXV1 = DoubleVector.broadcast(DSPECIES, alphaMulX1);
+ DoubleVector alphaMulXV2 = DoubleVector.broadcast(DSPECIES, alphaMulX2);
+ DoubleVector alphaMulXV3 = DoubleVector.broadcast(DSPECIES, alphaMulX3);
+ double a00 = a[aOffset + row - col * (col + 1) / 2 + n * col]; // lower part of the 4x4 diagonal block
+ double a10 = a[aOffset + (row + 1) - col * (col + 1) / 2 + n * col];
+ double a20 = a[aOffset + (row + 2) - col * (col + 1) / 2 + n * col];
+ double a30 = a[aOffset + (row + 3) - col * (col + 1) / 2 + n * col];
+ double a11 = a[aOffset + (row + 1) - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1)];
+ double a21 = a[aOffset + (row + 2) - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1)];
+ double a31 = a[aOffset + (row + 3) - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1)];
+ double a22 = a[aOffset + (row + 2) - (col + 2) * ((col + 2) + 1) / 2 + n * (col + 2)];
+ double a32 = a[aOffset + (row + 3) - (col + 2) * ((col + 2) + 1) / 2 + n * (col + 2)];
+ double a33 = a[aOffset + (row + 3) - (col + 3) * ((col + 3) + 1) / 2 + n * (col + 3)];
+ double accum0 = alphaMulX0 * a00 + alphaMulX1 * a10 + alphaMulX2 * a20 + alphaMulX3 * a30; // block part of y[col]
+ double accum1 = alphaMulX0 * a10 + alphaMulX1 * a11 + alphaMulX2 * a21 + alphaMulX3 * a31; // a10 reused as mirror
+ double accum2 = alphaMulX0 * a20 + alphaMulX1 * a21 + alphaMulX2 * a22 + alphaMulX3 * a32;
+ double accum3 = alphaMulX0 * a30 + alphaMulX1 * a31 + alphaMulX2 * a32 + alphaMulX3 * a33;
+ DoubleVector accumv0 = DoubleVector.zero(DSPECIES);
+ DoubleVector accumv1 = DoubleVector.zero(DSPECIES);
+ DoubleVector accumv2 = DoubleVector.zero(DSPECIES);
+ DoubleVector accumv3 = DoubleVector.zero(DSPECIES);
+ row += 4; // continue below the four rows handled by the diagonal block
+ for (; row <= (n - n % DSPECIES.length() - DSPECIES.length()); row += DSPECIES.length()) { // whole vectors only
+ DoubleVector av0 = DoubleVector.fromArray(DSPECIES, a, aOffset + row - col * (col + 1) / 2 + n * col);
+ DoubleVector av1 = DoubleVector.fromArray(DSPECIES, a,
+ aOffset + row - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1));
+ DoubleVector av2 = DoubleVector.fromArray(DSPECIES, a,
+ aOffset + row - (col + 2) * ((col + 2) + 1) / 2 + n * (col + 2));
+ DoubleVector av3 = DoubleVector.fromArray(DSPECIES, a,
+ aOffset + row - (col + 3) * ((col + 3) + 1) / 2 + n * (col + 3));
+ DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, yOffset + row);
+ DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, xOffset + row);
+ yv = alphaMulXV0.fma(av0, yv); // y[row..] += alpha * x[col] * A[row.., col]
+ yv = alphaMulXV1.fma(av1, yv);
+ yv = alphaMulXV2.fma(av2, yv);
+ alphaMulXV3.fma(av3, yv).intoArray(y, yOffset + row);
+ accumv0 = xv.fma(av0, accumv0); // lane-wise partial sums of x[row] * A[row, col]
+ accumv1 = xv.fma(av1, accumv1);
+ accumv2 = xv.fma(av2, accumv2);
+ accumv3 = xv.fma(av3, accumv3);
+ }
+ accum0 += alpha * accumv0.reduceLanes(VectorOperators.ADD); // fold the vector partial sums into the scalars
+ accum1 += alpha * accumv1.reduceLanes(VectorOperators.ADD);
+ accum2 += alpha * accumv2.reduceLanes(VectorOperators.ADD);
+ accum3 += alpha * accumv3.reduceLanes(VectorOperators.ADD);
+ for (; row < n; row += 1) { // rows left after the vector loop, one at a time
+ double a0 = a[aOffset + row - col * (col + 1) / 2 + n * col];
+ double a1 = a[aOffset + row - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1)];
+ double a2 = a[aOffset + row - (col + 2) * ((col + 2) + 1) / 2 + n * (col + 2)];
+ double a3 = a[aOffset + row - (col + 3) * ((col + 3) + 1) / 2 + n * (col + 3)];
+ y[yOffset + row] += alphaMulX0 * a0 + alphaMulX1 * a1 + alphaMulX2 * a2 + alphaMulX3 * a3;
+ accum0 += alpha * x[xOffset + row] * a0;
+ accum1 += alpha * x[xOffset + row] * a1;
+ accum2 += alpha * x[xOffset + row] * a2;
+ accum3 += alpha * x[xOffset + row] * a3;
+ }
+ y[yOffset + col] += accum0; // diagonal-block part plus the mirrored sums from the rows below
+ y[yOffset + (col + 1)] += accum1;
+ y[yOffset + (col + 2)] += accum2;
+ y[yOffset + (col + 3)] += accum3;
+ }
+ for (; col < n; col += 1) { // columns not covered by the 4-wide loop; scalar code only
+ double alphaMulX0 = alpha * x[xOffset + col];
+ y[yOffset + col] += a[aOffset + col - col * (col + 1) / 2 + n * col] * alphaMulX0; // diagonal entry
+ int row = col + 1;
+ double accum0 = 0.0d;
+ for (; row < n; row++) {
+ double a0 = a[aOffset + row - col * (col + 1) / 2 + n * col];
+ y[yOffset + row] += a0 * alphaMulX0;
+ accum0 += x[xOffset + row] * a0;
+ }
+ y[yOffset + col] += alpha * accum0;
+ }
+ }
+
+ private static void norDspmvL(int n, double alpha, double[] a, int aOffset, double[] x, int xOffset, int incx,
+ double[] y, int yOffset, int incy, int xStartIndex, int yStartIndex) {
+ int aIndx = 1;
+ for (int col = 0, xIndx = xStartIndex, yIndx = yStartIndex; col < n; col++, xIndx += incx, yIndx += incy) {
+ double alphaMulX = alpha * x[xIndx + xOffset];
+ double accum = 0.0d;
+ y[yIndx + yOffset] += alphaMulX * a[aIndx - 1 + aOffset];
+
+ for (int row = aIndx + 1, xi = xIndx + incx, yi = yIndx + incy; row < aIndx + n - col; row++, xi += incx,
+ yi += incy) {
+ y[yi + yOffset] += alphaMulX * a[row - 1 + aOffset];
+ accum += a[row - 1 + aOffset] * x[xi + xOffset];
+ }
+ y[yIndx + yOffset] += alpha * accum;
+ aIndx += n - col;
+ }
+ }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dspr.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dspr.java
new file mode 100644
index 0000000000000000000000000000000000000000..662e9282f860c32a214685130f3c2ed614805e63
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dspr.java
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas2.doubleprecision;
+
+import com.huawei.vectorblas.blas1.doubleprecision.Daxpy;
+import com.huawei.vectorblas.utils.BlasUtils;
+import com.huawei.vectorblas.utils.Lsame;
+
+ public class Dspr {
+     /**
+      * Rank-1 update of a symmetric matrix held in packed column storage. For every stored column {@code j}
+      * one {@code Daxpy.daxpy} call is issued with scalar {@code alpha * x[j]}, a slice of {@code x} and that
+      * column (presumably {@code column += scalar * slice}, as in reference BLAS DSPR). Columns whose
+      * {@code x[j]} is zero are skipped; {@code n == 0} or a zero {@code alpha} returns at once.
+      *
+      * @param uplo "U": column {@code j} holds {@code j + 1} entries; "L": it holds {@code n - j} entries
+      */
+     public static void dspr(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] ap,
+             int aOffset) {
+         BlasUtils.checkParameter("DSPR", 1, Lsame.lsame(uplo, "U") || Lsame.lsame(uplo, "L"));
+         BlasUtils.checkParameter("DSPR", 2, n >= 0);
+         BlasUtils.checkParameter("DSPR", 5, incx != 0);
+
+         if (n == 0 || BlasUtils.isZero(alpha)) {
+             return;
+         }
+
+         BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+         BlasUtils.checkBlasArray("a", aOffset, (1 + n) * n / 2 - 1, ap.length);
+
+         boolean uploFlag = Lsame.lsame(uplo, "U");
+         boolean forward = incx >= 0;
+         int xStartIndx = forward ? 0 : (1 - n) * incx; // a negative stride starts at the far end of x
+
+         int cnt = 0; // packed offset of the current column
+         for (int j = 0, xIndx = xStartIndx; j < n; j++, xIndx += incx) {
+             int colCnt = uploFlag ? j + 1 : n - j;
+             if (!BlasUtils.isZero(x[xIndx + xOffset])) {
+                 // The slice of x handed to daxpy begins at index 0 for upper/forward and lower/backward,
+                 // and at the current element for the two remaining combinations.
+                 int kIndx = (uploFlag == forward) ? 0 : xIndx;
+                 Daxpy.daxpy(colCnt, alpha * x[xIndx + xOffset], x, xOffset + kIndx, incx, ap, aOffset + cnt, 1);
+             }
+             cnt += colCnt;
+         }
+     }
+ }
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dsymv.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dsymv.java
new file mode 100644
index 0000000000000000000000000000000000000000..d13c55e7dcba2ab451ecc6aeb0005184a2032de8
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dsymv.java
@@ -0,0 +1,279 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas2.doubleprecision;
+
+import static com.huawei.vectorblas.utils.ArrayUtil.loopBound;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+import com.huawei.vectorblas.utils.Lsame;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
+public class Dsymv {
+ private static final VectorSpecies<Double> DSPECIES = DoubleVector.SPECIES_MAX; // widest shape available
+
+ public static void dsymv(String uplo, int n, double alpha, double[] a, int aOffset, int lda, double[] x,
+         int xOffset, int incx, double beta, double[] y, int yOffset, int incy) {
+     BlasUtils.checkParameter("DSYMV", 1, Lsame.lsame(uplo, "U") || Lsame.lsame(uplo, "L"));
+     BlasUtils.checkParameter("DSYMV", 2, n >= 0);
+     BlasUtils.checkParameter("DSYMV", 5, lda >= Math.max(1, n));
+     BlasUtils.checkParameter("DSYMV", 7, incx != 0);
+     BlasUtils.checkParameter("DSYMV", 10, incy != 0);
+
+     final boolean betaIsOne = Double.compare(beta, 1.0d) == 0;
+     if (n == 0 || (BlasUtils.isZero(alpha) && betaIsOne)) {
+         return; // nothing to scale and nothing to accumulate
+     }
+
+     BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+     BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length);
+     BlasUtils.checkBlasArray("a", aOffset, (n - 1) + (n - 1) * lda, a.length);
+     if (!betaIsOne) {
+         DblasLevel2.dMulBeta(n, beta, y, yOffset, incy); // presumably y *= beta before A * x is added
+     }
+     if (BlasUtils.isZero(alpha)) {
+         return;
+     }
+     final boolean upper = Lsame.lsame(uplo, "U");
+     final int xStartIndex = incx > 0 ? 0 : (n - 1) * (-incx);
+     final int yStartIndex = incy > 0 ? 0 : (n - 1) * (-incy);
+     if (incx == 1 && incy == 1) { // unit strides use the vectorized kernels
+         if (upper) {
+             vecDsymvU(n, x, xOffset, alpha, y, yOffset, a, aOffset, lda);
+         } else {
+             vecDsymvL(n, x, xOffset, alpha, y, yOffset, a, aOffset, lda);
+         }
+     } else if (upper) {
+         norDsymvU(n, x, xOffset, incx, alpha, y, yOffset, incy, a, aOffset, lda, xStartIndex, yStartIndex);
+     } else {
+         norDsymvL(n, x, xOffset, incx, alpha, y, yOffset, incy, a, aOffset, lda, xStartIndex, yStartIndex);
+     }
+ }
+
+ private static void vecDsymvU(int n, double[] x, int xOffset, double alpha, double[] y, int yOffset, double[] a,
+ int aOffset, int lda) { // unit-stride y += alpha * A * x; only entries with row <= col of A are read
+ int col = 0;
+ int colLoopBound = loopBound(n, 4); // presumably the largest multiple of 4 not exceeding n - confirm in ArrayUtil
+ for (; col < colLoopBound; col += 4) { // four columns per iteration (unroll factor 4)
+ double alphaMulX0 = alpha * x[col + xOffset];
+ double alphaMulX1 = alpha * x[(col + 1) + xOffset];
+ double alphaMulX2 = alpha * x[(col + 2) + xOffset];
+ double alphaMulX3 = alpha * x[(col + 3) + xOffset];
+ DoubleVector alphaXv0 = DoubleVector.broadcast(DSPECIES, alphaMulX0);
+ DoubleVector alphaXv1 = DoubleVector.broadcast(DSPECIES, alphaMulX1);
+ DoubleVector alphaXv2 = DoubleVector.broadcast(DSPECIES, alphaMulX2);
+ DoubleVector alphaXv3 = DoubleVector.broadcast(DSPECIES, alphaMulX3);
+ DoubleVector accumv0 = DoubleVector.zero(DSPECIES);
+ DoubleVector accumv1 = DoubleVector.zero(DSPECIES);
+ DoubleVector accumv2 = DoubleVector.zero(DSPECIES);
+ DoubleVector accumv3 = DoubleVector.zero(DSPECIES);
+ int row = 0; // entry (row, c) is read from a[row + c * lda + aOffset]
+ for (; row < col - col % DSPECIES.length(); row += DSPECIES.length()) { // whole vectors of rows above row col
+ DoubleVector av0 = DoubleVector.fromArray(DSPECIES, a, row + col * lda + aOffset);
+ DoubleVector av1 = DoubleVector.fromArray(DSPECIES, a, row + (col + 1) * lda + aOffset);
+ DoubleVector av2 = DoubleVector.fromArray(DSPECIES, a, row + (col + 2) * lda + aOffset);
+ DoubleVector av3 = DoubleVector.fromArray(DSPECIES, a, row + (col + 3) * lda + aOffset);
+ DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, row + yOffset);
+ DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, row + xOffset);
+ yv = av0.fma(alphaXv0, yv); // y[row..] += A[row.., col] * alpha * x[col]
+ yv = av1.fma(alphaXv1, yv);
+ yv = av2.fma(alphaXv2, yv);
+ av3.fma(alphaXv3, yv).intoArray(y, row + yOffset);
+ accumv0 = av0.fma(xv, accumv0); // lane-wise partial sums of A[row, col] * x[row]
+ accumv1 = av1.fma(xv, accumv1);
+ accumv2 = av2.fma(xv, accumv2);
+ accumv3 = av3.fma(xv, accumv3);
+ }
+ double accum0 = alpha * accumv0.reduceLanes(VectorOperators.ADD); // alpha * (sum over the vectorized rows)
+ double accum1 = alpha * accumv1.reduceLanes(VectorOperators.ADD);
+ double accum2 = alpha * accumv2.reduceLanes(VectorOperators.ADD);
+ double accum3 = alpha * accumv3.reduceLanes(VectorOperators.ADD);
+ for (; row < col; row++) { // remaining rows above row col, one at a time
+ double a0 = a[row + col * lda + aOffset];
+ double a1 = a[row + (col + 1) * lda + aOffset];
+ double a2 = a[row + (col + 2) * lda + aOffset];
+ double a3 = a[row + (col + 3) * lda + aOffset];
+ double x0 = x[row + xOffset];
+ y[row + yOffset] += alpha * (a0 * x[col + xOffset] + a1 * x[(col + 1) + xOffset]
+ + a2 * x[(col + 2) + xOffset] + a3 * x[(col + 3) + xOffset]);
+ accum0 += alpha * a0 * x0;
+ accum1 += alpha * a1 * x0;
+ accum2 += alpha * a2 * x0;
+ accum3 += alpha * a3 * x0;
+ }
+ double a00 = a[row + col * lda + aOffset]; // row == col here: upper part of the 4x4 diagonal block
+ double a01 = a[row + (col + 1) * lda + aOffset];
+ double a02 = a[row + (col + 2) * lda + aOffset];
+ double a03 = a[row + (col + 3) * lda + aOffset];
+ double a11 = a[(row + 1) + (col + 1) * lda + aOffset];
+ double a12 = a[(row + 1) + (col + 2) * lda + aOffset];
+ double a13 = a[(row + 1) + (col + 3) * lda + aOffset];
+ double a22 = a[(row + 2) + (col + 2) * lda + aOffset];
+ double a23 = a[(row + 2) + (col + 3) * lda + aOffset];
+ double a33 = a[(row + 3) + (col + 3) * lda + aOffset];
+ y[col + yOffset] += a00 * alphaMulX0 + a01 * alphaMulX1 + a02 * alphaMulX2 + a03 * alphaMulX3 + accum0;
+ y[(col + 1) + yOffset] += a01 * alphaMulX0 + a11 * alphaMulX1 + a12 * alphaMulX2 + a13 * alphaMulX3
+ + accum1; // a01 is reused as the mirrored entry A[col + 1, col]
+ y[(col + 2) + yOffset] += a02 * alphaMulX0 + a12 * alphaMulX1 + a22 * alphaMulX2 + a23 * alphaMulX3
+ + accum2;
+ y[(col + 3) + yOffset] += a03 * alphaMulX0 + a13 * alphaMulX1 + a23 * alphaMulX2 + a33 * alphaMulX3
+ + accum3;
+ }
+ for (; col < n; col++) { // columns not covered by the 4-wide loop, one at a time
+ double alphaMulX0 = alpha * x[col + xOffset];
+ DoubleVector alphaXv0 = DoubleVector.broadcast(DSPECIES, alphaMulX0);
+ DoubleVector accumv0 = DoubleVector.zero(DSPECIES);
+ int row = 0;
+ for (; row < col - col % DSPECIES.length(); row += DSPECIES.length()) {
+ DoubleVector av = DoubleVector.fromArray(DSPECIES, a, row + col * lda + aOffset);
+ DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, row + yOffset);
+ DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, row + xOffset);
+ av.fma(alphaXv0, yv).intoArray(y, row + yOffset);
+ accumv0 = av.fma(xv, accumv0);
+ }
+ double accum0 = alpha * accumv0.reduceLanes(VectorOperators.ADD);
+ for (; row < col; row++) {
+ double a0 = a[row + col * lda + aOffset];
+ y[row + yOffset] += a0 * alphaMulX0;
+ accum0 += alpha * a0 * x[row + xOffset];
+ }
+ y[col + yOffset] += a[row + col * lda + aOffset] * alphaMulX0 + accum0; // row == col: diagonal entry
+ }
+ }
+
+ private static void norDsymvU(int n, double[] x, int xOffset, int incx, double alpha, double[] y, int yOffset,
+ int incy, double[] a, int aOffset, int lda, int xStartIndex, int yStartIndex) {
+ for (int col = 0, xj = xStartIndex, yj = yStartIndex; col < n; col++, xj += incx, yj += incy) {
+ double alphaMulX = alpha * x[xj + xOffset];
+ double accum = 0.0d;
+
+ for (int row = 0, xIndx = xStartIndex, yIndx = yStartIndex; row < col; row++, xIndx += incx,
+ yIndx += incy) {
+ y[yIndx + yOffset] += alphaMulX * a[row + col * lda + aOffset];
+ accum += a[row + col * lda + aOffset] * x[xIndx + xOffset];
+ }
+ y[yj + yOffset] += alphaMulX * a[col + col * lda + aOffset] + alpha * accum;
+ }
+ }
+
+ private static void vecDsymvL(int n, double[] x, int xOffset, double alpha, double[] y, int yOffset, double[] a,
+ int aOffset, int lda) { // unit-stride y += alpha * A * x; only entries with row >= col of A are read
+ int col = 0;
+ int colLoopBound = loopBound(n, 4); // presumably the largest multiple of 4 not exceeding n - confirm in ArrayUtil
+ for (; col < colLoopBound; col += 4) { // four columns per iteration (unroll factor 4)
+ int row = col; // entry (row, c) is read from a[aOffset + row + c * lda]
+ double a00 = a[aOffset + row + col * lda]; // lower part of the 4x4 diagonal block
+ double a10 = a[aOffset + (row + 1) + col * lda];
+ double a20 = a[aOffset + (row + 2) + col * lda];
+ double a30 = a[aOffset + (row + 3) + col * lda];
+ double a11 = a[aOffset + (row + 1) + (col + 1) * lda];
+ double a21 = a[aOffset + (row + 2) + (col + 1) * lda];
+ double a31 = a[aOffset + (row + 3) + (col + 1) * lda];
+ double a22 = a[aOffset + (row + 2) + (col + 2) * lda];
+ double a32 = a[aOffset + (row + 3) + (col + 2) * lda];
+ double a33 = a[aOffset + (row + 3) + (col + 3) * lda];
+ double alphaMulX0 = alpha * x[xOffset + col];
+ double alphaMulX1 = alpha * x[xOffset + (col + 1)];
+ double alphaMulX2 = alpha * x[xOffset + (col + 2)];
+ double alphaMulX3 = alpha * x[xOffset + (col + 3)];
+ double accum0 = alphaMulX0 * a00 + alphaMulX1 * a10 + alphaMulX2 * a20 + alphaMulX3 * a30; // block part of y[col]
+ double accum1 = alphaMulX0 * a10 + alphaMulX1 * a11 + alphaMulX2 * a21 + alphaMulX3 * a31; // a10 reused as mirror
+ double accum2 = alphaMulX0 * a20 + alphaMulX1 * a21 + alphaMulX2 * a22 + alphaMulX3 * a32;
+ double accum3 = alphaMulX0 * a30 + alphaMulX1 * a31 + alphaMulX2 * a32 + alphaMulX3 * a33;
+ DoubleVector alphaMulXV0 = DoubleVector.broadcast(DSPECIES, alphaMulX0);
+ DoubleVector alphaMulXV1 = DoubleVector.broadcast(DSPECIES, alphaMulX1);
+ DoubleVector alphaMulXV2 = DoubleVector.broadcast(DSPECIES, alphaMulX2);
+ DoubleVector alphaMulXV3 = DoubleVector.broadcast(DSPECIES, alphaMulX3);
+ DoubleVector accumv0 = DoubleVector.zero(DSPECIES);
+ DoubleVector accumv1 = DoubleVector.zero(DSPECIES);
+ DoubleVector accumv2 = DoubleVector.zero(DSPECIES);
+ DoubleVector accumv3 = DoubleVector.zero(DSPECIES);
+ row += 4; // continue below the four rows handled by the diagonal block
+ for (; row <= (n - n % DSPECIES.length() - DSPECIES.length()); row += DSPECIES.length()) { // whole vectors only
+ DoubleVector av0 = DoubleVector.fromArray(DSPECIES, a, aOffset + row + col * lda);
+ DoubleVector av1 = DoubleVector.fromArray(DSPECIES, a, aOffset + row + (col + 1) * lda);
+ DoubleVector av2 = DoubleVector.fromArray(DSPECIES, a, aOffset + row + (col + 2) * lda);
+ DoubleVector av3 = DoubleVector.fromArray(DSPECIES, a, aOffset + row + (col + 3) * lda);
+ DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, yOffset + row);
+ DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, xOffset + row);
+ yv = alphaMulXV0.fma(av0, yv); // y[row..] += alpha * x[col] * A[row.., col]
+ yv = alphaMulXV1.fma(av1, yv);
+ yv = alphaMulXV2.fma(av2, yv);
+ alphaMulXV3.fma(av3, yv).intoArray(y, yOffset + row);
+ accumv0 = xv.fma(av0, accumv0); // lane-wise partial sums of x[row] * A[row, col]
+ accumv1 = xv.fma(av1, accumv1);
+ accumv2 = xv.fma(av2, accumv2);
+ accumv3 = xv.fma(av3, accumv3);
+ }
+ accum0 += alpha * accumv0.reduceLanes(VectorOperators.ADD); // fold the vector partial sums into the scalars
+ accum1 += alpha * accumv1.reduceLanes(VectorOperators.ADD);
+ accum2 += alpha * accumv2.reduceLanes(VectorOperators.ADD);
+ accum3 += alpha * accumv3.reduceLanes(VectorOperators.ADD);
+ for (; row < n; row += 1) { // rows left after the vector loop, one at a time
+ double a0 = a[aOffset + row + col * lda];
+ double a1 = a[aOffset + row + (col + 1) * lda];
+ double a2 = a[aOffset + row + (col + 2) * lda];
+ double a3 = a[aOffset + row + (col + 3) * lda];
+ y[yOffset + row] += alphaMulX0 * a0 + alphaMulX1 * a1 + alphaMulX2 * a2 + alphaMulX3 * a3;
+ accum0 += alpha * x[xOffset + row] * a0;
+ accum1 += alpha * x[xOffset + row] * a1;
+ accum2 += alpha * x[xOffset + row] * a2;
+ accum3 += alpha * x[xOffset + row] * a3;
+ }
+ y[yOffset + col] += accum0; // diagonal-block part plus the mirrored sums from the rows below
+ y[yOffset + (col + 1)] += accum1;
+ y[yOffset + (col + 2)] += accum2;
+ y[yOffset + (col + 3)] += accum3;
+ }
+ for (; col < n; col += 1) { // columns not covered by the 4-wide loop; scalar code only
+ double alphaMulX0 = alpha * x[xOffset + col];
+ y[yOffset + col] += a[aOffset + col + col * lda] * alphaMulX0; // diagonal entry
+ int row = col + 1;
+ double accum0 = 0.0d;
+ for (; row < n; row++) {
+ double a0 = a[aOffset + row + col * lda];
+ y[yOffset + row] += a0 * alphaMulX0;
+ accum0 += x[xOffset + row] * a0;
+ }
+ y[yOffset + col] += alpha * accum0;
+ }
+ }
+
+ private static void norDsymvL(int n, double[] x, int xOffset, int incx, double alpha, double[] y, int yOffset,
+ int incy, double[] a, int aOffset, int lda, int xStartIndex, int yStartIndex) {
+ for (int col = 0, xj = xStartIndex, yj = yStartIndex; col < n; col++, xj += incx, yj += incy) {
+ double alphaMulX = alpha * x[xj + xOffset];
+ y[yj + yOffset] += alphaMulX * a[col + col * lda + aOffset];
+ double accum = 0.0d;
+
+ for (int row = col + 1, xIndx = xj + incx, yIndx = yj + incy; row < n; row++, xIndx += incx,
+ yIndx += incy) {
+ y[yIndx + yOffset] += alphaMulX * a[row + col * lda + aOffset];
+ accum += a[row + col * lda + aOffset] * x[xIndx + xOffset];
+ }
+ y[yj + yOffset] += alpha * accum;
+ }
+ }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/SblasLevel2.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/SblasLevel2.java
new file mode 100644
index 0000000000000000000000000000000000000000..6f1cf5f1e4495fb1a262c4cf6446cafd04177e06
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/SblasLevel2.java
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas2.singleprecision;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.VectorSpecies;
+
+public class SblasLevel2 {
+    private static final VectorSpecies<Float> SSPECIES = FloatVector.SPECIES_MAX;
+    /** Scales {@code size} entries of {@code sy} (from {@code yOffset}, stride {@code incy}) in place by {@code beta};
+     *  a zero {@code beta} stores exact zeros, so NaN/Inf already present in {@code sy} is not propagated. */
+    protected static void sMulBeta(int size, float beta, float[] sy, int yOffset, int incy) {
+        int yIndex = incy >= 0 ? 0 : (1 - size) * incy; // negative stride: logical element 0 is at the highest index
+        if (BlasUtils.isZero(beta)) {
+            // Assign rather than multiply: 0 * NaN is NaN, and BLAS lets y be unset on input when beta is zero.
+            for (int i = 0; i < size; i++, yIndex += incy) {
+                sy[yIndex + yOffset] = 0.0f;
+            }
+        } else if (incy == 1) {
+            FloatVector betaVec = FloatVector.broadcast(SSPECIES, beta);
+            int vecBound = SSPECIES.loopBound(size); // hoisted out of the loop condition
+            int idx = 0;
+            for (; idx < vecBound; idx += SSPECIES.length()) {
+                betaVec.mul(FloatVector.fromArray(SSPECIES, sy, idx + yOffset)).intoArray(sy, idx + yOffset);
+            }
+            for (; idx < size; idx++) { // scalar tail for the entries past the last full vector
+                sy[idx + yOffset] = beta * sy[idx + yOffset];
+            }
+        } else {
+            for (int i = 0; i < size; i++, yIndex += incy) {
+                sy[yIndex + yOffset] = beta * sy[yIndex + yOffset];
+            }
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sgemv.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sgemv.java
new file mode 100644
index 0000000000000000000000000000000000000000..a3b8cdc3965025ac118dfce27aa54bc9e11ae869
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sgemv.java
@@ -0,0 +1,377 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas2.singleprecision;
+
+import static com.huawei.vectorblas.utils.ArrayUtil.loopBound;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+import com.huawei.vectorblas.utils.Lsame;
+
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
+public class Sgemv {
+    private static final VectorSpecies<Float> SSPECIES = FloatVector.SPECIES_MAX;
+    /** Computes y := alpha*op(A)*x + beta*y, op(A) = A for trans "N" and A^T for "T"/"C"; A is m x n, column-major. */
+    public static void sgemv(String trans, int m, int n, float alpha, float[] a, int aOffset, int lda, float[] x,
+            int xOffset, int incx, float beta, float[] y, int yOffset, int incy) {
+        boolean transFlag = Lsame.lsame(trans, "N"); // true selects the non-transposed (op(A) = A) kernels
+        BlasUtils.checkParameter("SGEMV", 1, transFlag || Lsame.lsame(trans, "T") || Lsame.lsame(trans, "C"));
+        BlasUtils.checkParameter("SGEMV", 2, m >= 0);
+        BlasUtils.checkParameter("SGEMV", 3, n >= 0);
+        BlasUtils.checkParameter("SGEMV", 6, lda >= Math.max(1, m));
+        BlasUtils.checkParameter("SGEMV", 8, incx != 0);
+        BlasUtils.checkParameter("SGEMV", 11, incy != 0);
+        if (m == 0 || n == 0 || (BlasUtils.isZero(alpha) && Float.compare(beta, 1.0f) == 0)) {
+            return;
+        }
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * ((transFlag ? n : m) - 1), x.length);
+        BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * ((transFlag ? m : n) - 1), y.length);
+        BlasUtils.checkBlasArray("a", aOffset, (n - 1) * lda + m - 1, a.length);
+        // Scale y by beta first; the kernels below only accumulate alpha * op(A) * x into y.
+        if (Float.compare(beta, 1.0f) != 0) {
+            SblasLevel2.sMulBeta(transFlag ? m : n, beta, y, yOffset, incy);
+        }
+        if (BlasUtils.isZero(alpha)) { // nothing left to accumulate once y has been scaled
+            return;
+        }
+        if (transFlag) {
+            if (incy == 1) {
+                if (incx == 1) {
+                    vecSgemvN(m, n, alpha, a, aOffset, lda, x, xOffset, y, yOffset);
+                } else {
+                    vecSgemvN(m, n, alpha, a, aOffset, lda, x, xOffset, incx, y, yOffset);
+                }
+            } else {
+                norSgemvN(m, n, alpha, a, aOffset, lda, x, xOffset, incx, y, yOffset, incy);
+            }
+        } else {
+            if (incx == 1) {
+                if (incy == 1) {
+                    vecSgemvT(m, n, alpha, a, aOffset, lda, x, xOffset, y, yOffset);
+                } else {
+                    vecSgemvT(m, n, alpha, a, aOffset, lda, x, xOffset, y, yOffset, incy);
+                }
+            } else {
+                norSgemvT(m, n, alpha, a, aOffset, lda, x, xOffset, incx, y, yOffset, incy);
+            }
+        }
+    }
+    /** y += alpha*A*x for unit strides; four columns per pass, rows in 4x-unrolled vector chunks plus tails. */
+    private static void vecSgemvN(int m, int n, float alpha, float[] a, int aOffset, int lda,
+            float[] x, int xOffset, float[] y, int yOffset) {
+        int col = 0;
+        int colLoopBound = loopBound(n, 4);
+        int rowUnrollLoopBound = loopBound(m, SSPECIES.length() * 4);
+        int rowLoopBound = loopBound(m, SSPECIES.length());
+        for (; col < colLoopBound; col += 4) {
+            FloatVector xv0 = FloatVector.broadcast(SSPECIES, alpha * x[col + xOffset]);
+            FloatVector xv1 = FloatVector.broadcast(SSPECIES, alpha * x[col + 1 + xOffset]);
+            FloatVector xv2 = FloatVector.broadcast(SSPECIES, alpha * x[col + 2 + xOffset]);
+            FloatVector xv3 = FloatVector.broadcast(SSPECIES, alpha * x[col + 3 + xOffset]);
+            int row = 0;
+            for (; row < rowUnrollLoopBound; row += SSPECIES.length() * 4) {
+                FloatVector yv0 = FloatVector.fromArray(SSPECIES, y, row + yOffset);
+                FloatVector yv1 = FloatVector.fromArray(SSPECIES, y, row + SSPECIES.length() + yOffset);
+                FloatVector yv2 = FloatVector.fromArray(SSPECIES, y, row + SSPECIES.length() * 2 + yOffset);
+                FloatVector yv3 = FloatVector.fromArray(SSPECIES, y, row + SSPECIES.length() * 3 + yOffset);
+
+                FloatVector av00 = FloatVector.fromArray(SSPECIES, a, row + col * lda + aOffset);
+                FloatVector av10 = FloatVector.fromArray(
+                        SSPECIES, a, row + SSPECIES.length() + col * lda + aOffset);
+                FloatVector av20 = FloatVector.fromArray(
+                        SSPECIES, a, (row + SSPECIES.length() * 2) + col * lda + aOffset);
+                FloatVector av30 = FloatVector.fromArray(
+                        SSPECIES, a, (row + SSPECIES.length() * 3) + col * lda + aOffset);
+
+                FloatVector av01 = FloatVector.fromArray(SSPECIES, a, row + (col + 1) * lda + aOffset);
+                FloatVector av11 = FloatVector.fromArray(
+                        SSPECIES, a, row + SSPECIES.length() + (col + 1) * lda + aOffset);
+                FloatVector av21 = FloatVector.fromArray(
+                        SSPECIES, a, (row + SSPECIES.length() * 2) + (col + 1) * lda + aOffset);
+                FloatVector av31 = FloatVector.fromArray(
+                        SSPECIES, a, (row + SSPECIES.length() * 3) + (col + 1) * lda + aOffset);
+
+                FloatVector av02 = FloatVector.fromArray(SSPECIES, a, row + (col + 2) * lda + aOffset);
+                FloatVector av12 = FloatVector.fromArray(
+                        SSPECIES, a, row + SSPECIES.length() + (col + 2) * lda + aOffset);
+                FloatVector av22 = FloatVector.fromArray(
+                        SSPECIES, a, (row + SSPECIES.length() * 2) + (col + 2) * lda + aOffset);
+                FloatVector av32 = FloatVector.fromArray(
+                        SSPECIES, a, (row + SSPECIES.length() * 3) + (col + 2) * lda + aOffset);
+
+                FloatVector av03 = FloatVector.fromArray(SSPECIES, a, row + (col + 3) * lda + aOffset);
+                FloatVector av13 = FloatVector.fromArray(
+                        SSPECIES, a, row + SSPECIES.length() + (col + 3) * lda + aOffset);
+                FloatVector av23 = FloatVector.fromArray(
+                        SSPECIES, a, (row + SSPECIES.length() * 2) + (col + 3) * lda + aOffset);
+                FloatVector av33 = FloatVector.fromArray(
+                        SSPECIES, a, (row + SSPECIES.length() * 3) + (col + 3) * lda + aOffset);
+                // Chain the four column contributions onto each y chunk with nested fused multiply-adds.
+                av00.fma(xv0, av01.fma(xv1, av02.fma(xv2, av03.fma(xv3, yv0)))).intoArray(y, row + yOffset);
+                av10.fma(xv0, av11.fma(xv1, av12.fma(xv2, av13.fma(xv3, yv1))))
+                        .intoArray(y, row + SSPECIES.length() + yOffset);
+                av20.fma(xv0, av21.fma(xv1, av22.fma(xv2, av23.fma(xv3, yv2))))
+                        .intoArray(y, row + SSPECIES.length() * 2 + yOffset);
+                av30.fma(xv0, av31.fma(xv1, av32.fma(xv2, av33.fma(xv3, yv3))))
+                        .intoArray(y, row + SSPECIES.length() * 3 + yOffset);
+            }
+            for (; row < rowLoopBound; row += SSPECIES.length()) { // one vector of rows at a time
+                FloatVector yv = FloatVector.fromArray(SSPECIES, y, row + yOffset);
+
+                FloatVector av0 = FloatVector.fromArray(SSPECIES, a, row + col * lda + aOffset);
+                FloatVector av1 = FloatVector.fromArray(SSPECIES, a, row + (col + 1) * lda + aOffset);
+                FloatVector av2 = FloatVector.fromArray(SSPECIES, a, row + (col + 2) * lda + aOffset);
+                FloatVector av3 = FloatVector.fromArray(SSPECIES, a, row + (col + 3) * lda + aOffset);
+
+                av0.fma(xv0, av1.fma(xv1, av2.fma(xv2, av3.fma(xv3, yv)))).intoArray(y, row + yOffset);
+            }
+            float x0 = alpha * x[col + xOffset];
+            float x1 = alpha * x[col + 1 + xOffset];
+            float x2 = alpha * x[col + 2 + xOffset];
+            float x3 = alpha * x[col + 3 + xOffset];
+            for (; row < m; row++) { // scalar tail for the remaining rows
+                y[row + yOffset] += x0 * a[row + col * lda + aOffset]
+                        + x1 * a[row + (col + 1) * lda + aOffset]
+                        + x2 * a[row + (col + 2) * lda + aOffset]
+                        + x3 * a[row + (col + 3) * lda + aOffset];
+            }
+        }
+        for (; col < n; col++) { // columns not covered by the four-wide pass
+            if (!BlasUtils.isZero(x[col + xOffset])) {
+                FloatVector bv = FloatVector.broadcast(SSPECIES, alpha * x[col + xOffset]);
+                int row = 0;
+                for (; row < rowUnrollLoopBound; row += SSPECIES.length() * 4) {
+                    FloatVector yv0 = FloatVector.fromArray(SSPECIES, y, row + yOffset);
+                    FloatVector yv1 = FloatVector.fromArray(SSPECIES, y, row + SSPECIES.length() + yOffset);
+                    FloatVector yv2 = FloatVector.fromArray(SSPECIES, y, row + SSPECIES.length() * 2 + yOffset);
+                    FloatVector yv3 = FloatVector.fromArray(SSPECIES, y, row + SSPECIES.length() * 3 + yOffset);
+
+                    FloatVector av0 = FloatVector.fromArray(SSPECIES, a, row + col * lda + aOffset);
+                    FloatVector av1 = FloatVector.fromArray(
+                            SSPECIES, a, row + SSPECIES.length() + col * lda + aOffset);
+                    FloatVector av2 = FloatVector.fromArray(
+                            SSPECIES, a, (row + SSPECIES.length() * 2) + col * lda + aOffset);
+                    FloatVector av3 = FloatVector.fromArray(
+                            SSPECIES, a, (row + SSPECIES.length() * 3) + col * lda + aOffset);
+
+                    av0.fma(bv, yv0).intoArray(y, row + yOffset);
+                    av1.fma(bv, yv1).intoArray(y, row + SSPECIES.length() + yOffset);
+                    av2.fma(bv, yv2).intoArray(y, row + SSPECIES.length() * 2 + yOffset);
+                    av3.fma(bv, yv3).intoArray(y, row + SSPECIES.length() * 3 + yOffset);
+                }
+                for (; row < rowLoopBound; row += SSPECIES.length()) {
+                    FloatVector yv = FloatVector.fromArray(SSPECIES, y, row + yOffset);
+                    FloatVector av = FloatVector.fromArray(SSPECIES, a, row + col * lda + aOffset);
+                    bv.fma(av, yv).intoArray(y, row + yOffset);
+                }
+                float alphaX = alpha * x[col + xOffset];
+                for (; row < m; row++) {
+                    y[row + yOffset] += alphaX * a[row + col * lda + aOffset];
+                }
+            }
+        }
+    }
+    /** y += alpha*A*x for unit-stride y and strided x; columns whose x entry is zero are skipped. */
+    private static void vecSgemvN(int m, int n, float alpha, float[] a, int aOffset, int lda,
+            float[] x, int xOffset, int incx, float[] y, int yOffset) {
+        int xIndex = incx > 0 ? 0 : (n - 1) * (-incx); // negative stride starts from the far end of x
+        int rowLoopBound = SSPECIES.loopBound(m);
+        for (int col = 0; col < n; col++, xIndex += incx) {
+            if (!BlasUtils.isZero(x[xIndex + xOffset])) {
+                float alphaMulX = alpha * x[xIndex + xOffset];
+                FloatVector alphaMulXv = FloatVector.broadcast(SSPECIES, alphaMulX);
+                int row = 0;
+                for (; row < rowLoopBound; row += SSPECIES.length()) {
+                    FloatVector av = FloatVector.fromArray(SSPECIES, a, row + col * lda + aOffset);
+                    FloatVector cv = FloatVector.fromArray(SSPECIES, y, row + yOffset);
+                    av.fma(alphaMulXv, cv).intoArray(y, row + yOffset);
+                }
+                for (; row < m; row++) {
+                    y[row + yOffset] += alphaMulX * a[row + col * lda + aOffset];
+                }
+            }
+        }
+    }
+    /** Scalar y += alpha*A*x for arbitrary incx/incy; columns whose x entry is zero are skipped. */
+    private static void norSgemvN(int m, int n, float alpha, float[] a, int aOffset, int lda,
+            float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
+        int xIndex = incx > 0 ? 0 : (n - 1) * (-incx);
+        for (int col = 0; col < n; col++, xIndex += incx) {
+            if (!BlasUtils.isZero(x[xIndex + xOffset])) {
+                float alphaMulX = alpha * x[xIndex + xOffset];
+                int yIndex = incy > 0 ? 0 : (m - 1) * (-incy);
+                for (int row = 0; row < m; row++, yIndex += incy) {
+                    y[yIndex + yOffset] += alphaMulX * a[row + col * lda + aOffset];
+                }
+            }
+        }
+    }
+    /** y += alpha*A^T*x for unit-stride x and strided y; each output is a vectorized dot of one column of A with x. */
+    private static void vecSgemvT(int m, int n, float alpha, float[] a, int aOffset, int lda, float[] x, int xOffset,
+            float[] y, int yOffset, int incy) {
+        int yIndex = incy > 0 ? 0 : (n - 1) * (-incy);
+        int colLoopBound = SSPECIES.loopBound(m);
+        for (int row = 0; row < n; row++, yIndex += incy) { // "row" indexes A^T, i.e. a column of the stored A
+            FloatVector cv = FloatVector.zero(SSPECIES);
+            int col = 0;
+            for (; col < colLoopBound; col += SSPECIES.length()) {
+                FloatVector av = FloatVector.fromArray(SSPECIES, a, col + row * lda + aOffset);
+                FloatVector bv = FloatVector.fromArray(SSPECIES, x, col + xOffset);
+                cv = av.fma(bv, cv);
+            }
+            float accum = cv.reduceLanes(VectorOperators.ADD);
+            for (; col < m; col++) {
+                accum += a[col + row * lda + aOffset] * x[col + xOffset];
+            }
+            y[yIndex + yOffset] += alpha * accum;
+        }
+    }
+    /** y += alpha*A^T*x for unit strides; four columns of the stored A are reduced against x per pass. */
+    private static void vecSgemvT(int m, int n, float alpha, float[] a, int aOffset, int lda,
+            float[] x, int xOffset, float[] y, int yOffset) {
+        int row = 0;
+        int rowLoopBound = loopBound(n, 4);
+        int colUnrollLoopBound = loopBound(m, SSPECIES.length() * 4);
+        int colLoopBound = loopBound(m, SSPECIES.length());
+        for (; row < rowLoopBound; row += 4) {
+            FloatVector yv0 = FloatVector.zero(SSPECIES);
+            FloatVector yv1 = FloatVector.zero(SSPECIES);
+            FloatVector yv2 = FloatVector.zero(SSPECIES);
+            FloatVector yv3 = FloatVector.zero(SSPECIES);
+            int col = 0;
+            for (; col < colUnrollLoopBound; col += SSPECIES.length() * 4) {
+                FloatVector xv0 = FloatVector.fromArray(SSPECIES, x, col + xOffset);
+                FloatVector xv1 = FloatVector.fromArray(SSPECIES, x, col + SSPECIES.length() + xOffset);
+                FloatVector xv2 = FloatVector.fromArray(SSPECIES, x, col + (SSPECIES.length() * 2) + xOffset);
+                FloatVector xv3 = FloatVector.fromArray(SSPECIES, x, col + (SSPECIES.length() * 3) + xOffset);
+
+                FloatVector av00 = FloatVector.fromArray(SSPECIES, a, col + row * lda + aOffset);
+                FloatVector av10 = FloatVector.fromArray(
+                        SSPECIES, a, col + SSPECIES.length() + row * lda + aOffset);
+                FloatVector av20 = FloatVector.fromArray(
+                        SSPECIES, a, col + (SSPECIES.length() * 2) + row * lda + aOffset);
+                FloatVector av30 = FloatVector.fromArray(
+                        SSPECIES, a, col + (SSPECIES.length() * 3) + row * lda + aOffset);
+                yv0 = av00.fma(xv0, av10.fma(xv1, av20.fma(xv2, av30.fma(xv3, yv0))));
+
+                FloatVector av01 = FloatVector.fromArray(SSPECIES, a, col + (row + 1) * lda + aOffset);
+                FloatVector av11 = FloatVector.fromArray(
+                        SSPECIES, a, col + SSPECIES.length() + (row + 1) * lda + aOffset);
+                FloatVector av21 = FloatVector.fromArray(
+                        SSPECIES, a, col + (SSPECIES.length() * 2) + (row + 1) * lda + aOffset);
+                FloatVector av31 = FloatVector.fromArray(
+                        SSPECIES, a, col + (SSPECIES.length() * 3) + (row + 1) * lda + aOffset);
+                yv1 = av01.fma(xv0, av11.fma(xv1, av21.fma(xv2, av31.fma(xv3, yv1))));
+
+                FloatVector av02 = FloatVector.fromArray(SSPECIES, a, col + (row + 2) * lda + aOffset);
+                FloatVector av12 = FloatVector.fromArray(
+                        SSPECIES, a, col + SSPECIES.length() + (row + 2) * lda + aOffset);
+                FloatVector av22 = FloatVector.fromArray(
+                        SSPECIES, a, col + (SSPECIES.length() * 2) + (row + 2) * lda + aOffset);
+                FloatVector av32 = FloatVector.fromArray(
+                        SSPECIES, a, col + (SSPECIES.length() * 3) + (row + 2) * lda + aOffset);
+                yv2 = av02.fma(xv0, av12.fma(xv1, av22.fma(xv2, av32.fma(xv3, yv2))));
+
+                FloatVector av03 = FloatVector.fromArray(SSPECIES, a, col + (row + 3) * lda + aOffset);
+                FloatVector av13 = FloatVector.fromArray(
+                        SSPECIES, a, col + SSPECIES.length() + (row + 3) * lda + aOffset);
+                FloatVector av23 = FloatVector.fromArray(
+                        SSPECIES, a, col + (SSPECIES.length() * 2) + (row + 3) * lda + aOffset);
+                FloatVector av33 = FloatVector.fromArray(
+                        SSPECIES, a, col + (SSPECIES.length() * 3) + (row + 3) * lda + aOffset);
+                yv3 = av03.fma(xv0, av13.fma(xv1, av23.fma(xv2, av33.fma(xv3, yv3))));
+            }
+            for (; col < colLoopBound; col += SSPECIES.length()) {
+                FloatVector xv = FloatVector.fromArray(SSPECIES, x, col + xOffset);
+
+                FloatVector av0 = FloatVector.fromArray(SSPECIES, a, col + row * lda + aOffset);
+                FloatVector av1 = FloatVector.fromArray(SSPECIES, a, col + (row + 1) * lda + aOffset);
+                FloatVector av2 = FloatVector.fromArray(SSPECIES, a, col + (row + 2) * lda + aOffset);
+                FloatVector av3 = FloatVector.fromArray(SSPECIES, a, col + (row + 3) * lda + aOffset);
+
+                yv0 = av0.fma(xv, yv0);
+                yv1 = av1.fma(xv, yv1);
+                yv2 = av2.fma(xv, yv2);
+                yv3 = av3.fma(xv, yv3);
+            }
+            float accum0 = yv0.reduceLanes(VectorOperators.ADD); // collapse each lane-wise partial sum to a scalar
+            float accum1 = yv1.reduceLanes(VectorOperators.ADD);
+            float accum2 = yv2.reduceLanes(VectorOperators.ADD);
+            float accum3 = yv3.reduceLanes(VectorOperators.ADD);
+            for (; col < m; col++) {
+                accum0 += a[col + row * lda + aOffset] * x[col + xOffset];
+                accum1 += a[col + (row + 1) * lda + aOffset] * x[col + xOffset];
+                accum2 += a[col + (row + 2) * lda + aOffset] * x[col + xOffset];
+                accum3 += a[col + (row + 3) * lda + aOffset] * x[col + xOffset];
+            }
+            y[row + yOffset] += alpha * accum0;
+            y[row + 1 + yOffset] += alpha * accum1;
+            y[row + 2 + yOffset] += alpha * accum2;
+            y[row + 3 + yOffset] += alpha * accum3;
+        }
+        for (; row < n; row++) { // outputs not covered by the four-wide pass
+            FloatVector yv = FloatVector.zero(SSPECIES);
+            int col = 0;
+            for (; col < colUnrollLoopBound; col += SSPECIES.length() * 4) {
+                FloatVector xv0 = FloatVector.fromArray(SSPECIES, x, col + xOffset);
+                FloatVector xv1 = FloatVector.fromArray(SSPECIES, x, col + SSPECIES.length() + xOffset);
+                FloatVector xv2 = FloatVector.fromArray(SSPECIES, x, col + (SSPECIES.length() * 2) + xOffset);
+                FloatVector xv3 = FloatVector.fromArray(SSPECIES, x, col + (SSPECIES.length() * 3) + xOffset);
+
+                FloatVector av0 = FloatVector.fromArray(SSPECIES, a, col + row * lda + aOffset);
+                FloatVector av1 = FloatVector.fromArray(SSPECIES, a, col + SSPECIES.length() + row * lda + aOffset);
+                FloatVector av2 = FloatVector.fromArray(
+                        SSPECIES, a, col + (SSPECIES.length() * 2) + row * lda + aOffset);
+                FloatVector av3 = FloatVector.fromArray(
+                        SSPECIES, a, col + (SSPECIES.length() * 3) + row * lda + aOffset);
+
+                yv = av0.fma(xv0, av1.fma(xv1, av2.fma(xv2, av3.fma(xv3, yv))));
+            }
+            for (; col < colLoopBound; col += SSPECIES.length()) {
+                FloatVector xv = FloatVector.fromArray(SSPECIES, x, col + xOffset);
+                FloatVector av = FloatVector.fromArray(SSPECIES, a, col + row * lda + aOffset);
+                yv = xv.fma(av, yv);
+            }
+            float accum = yv.reduceLanes(VectorOperators.ADD);
+            for (; col < m; col++) {
+                accum += x[col + xOffset] * a[col + row * lda + aOffset];
+            }
+            y[row + yOffset] += alpha * accum;
+        }
+    }
+    /** Scalar y += alpha*A^T*x for arbitrary incx/incy. */
+    private static void norSgemvT(int m, int n, float alpha, float[] a, int aOffset, int lda, float[] x, int xOffset,
+            int incx, float[] y, int yOffset, int incy) {
+        int yIndex = incy > 0 ? 0 : (n - 1) * (-incy);
+        for (int row = 0; row < n; row++, yIndex += incy) {
+            float accum = 0.0f;
+            int xIndex = incx > 0 ? 0 : (m - 1) * (-incx);
+            for (int col = 0; col < m; col++, xIndex += incx) {
+                accum += a[col + row * lda + aOffset] * x[xIndex + xOffset];
+            }
+            y[yIndex + yOffset] += alpha * accum;
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sger.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sger.java
new file mode 100644
index 0000000000000000000000000000000000000000..1af0afc40d7747f8407a53a7f7ceb04781b2f359
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sger.java
@@ -0,0 +1,177 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas2.singleprecision;
+
+import static com.huawei.vectorblas.utils.ArrayUtil.loopBound;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.VectorSpecies;
+
+public class Sger {
+    private static final VectorSpecies<Float> SSPECIES = FloatVector.SPECIES_MAX;
+    private static final int UNROLL_SIZE = 4;
+    /** Rank-1 update A := alpha*x*y^T + A for an m x n column-major matrix A with leading dimension lda. */
+    public static void sger(int m, int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset,
+            int incy, float[] a, int aOffset, int lda) {
+        BlasUtils.checkParameter("SGER", 1, m >= 0);
+        BlasUtils.checkParameter("SGER", 2, n >= 0);
+        BlasUtils.checkParameter("SGER", 5, incx != 0);
+        BlasUtils.checkParameter("SGER", 7, incy != 0);
+        BlasUtils.checkParameter("SGER", 9, lda >= Math.max(1, m));
+        // Nothing to update for an empty matrix or a zero alpha.
+        if (m == 0 || n == 0 || BlasUtils.isZero(alpha)) {
+            return;
+        }
+
+        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (m - 1), x.length);
+        BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length);
+        BlasUtils.checkBlasArray("a", aOffset, (m - 1) + (n - 1) * lda, a.length);
+        // Only the fully contiguous case takes the vectorized kernel.
+        if (incx == 1 && incy == 1) {
+            vecSger(m, n, alpha, x, xOffset, y, yOffset, a, aOffset, lda);
+        } else {
+            normalSger(m, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset, lda);
+        }
+    }
+    /** Unit-stride kernel: four columns per pass, rows in 4x-unrolled vector chunks followed by a scalar tail. */
+    private static void vecSger(int m, int n, float alpha, float[] x, int xOffset, float[] y, int yOffset, float[] a,
+            int aOffset, int lda) {
+        int colLoopBound = loopBound(n, UNROLL_SIZE);
+        int rowLoopBound = loopBound(m, UNROLL_SIZE * SSPECIES.length());
+        int col = 0;
+        for (; col < colLoopBound; col += UNROLL_SIZE) {
+            FloatVector alphaMulYv0 = FloatVector.broadcast(SSPECIES, alpha * y[col + yOffset]);
+            FloatVector alphaMulYv1 = FloatVector.broadcast(SSPECIES, alpha * y[col + 1 + yOffset]);
+            FloatVector alphaMulYv2 = FloatVector.broadcast(SSPECIES, alpha * y[col + 2 + yOffset]);
+            FloatVector alphaMulYv3 = FloatVector.broadcast(SSPECIES, alpha * y[col + 3 + yOffset]);
+            int row = 0;
+            for (; row < rowLoopBound; row += UNROLL_SIZE * SSPECIES.length()) {
+                FloatVector xv0 = FloatVector.fromArray(SSPECIES, x, row + xOffset);
+                FloatVector xv1 = FloatVector.fromArray(SSPECIES, x, row + SSPECIES.length() + xOffset);
+                FloatVector xv2 = FloatVector.fromArray(SSPECIES, x, row + 2 * SSPECIES.length() + xOffset);
+                FloatVector xv3 = FloatVector.fromArray(SSPECIES, x, row + 3 * SSPECIES.length() + xOffset);
+                // Column col: a += x * (alpha * y[col]); the same x chunks are reused for the next three columns.
+                FloatVector av00 = FloatVector.fromArray(SSPECIES, a, row + col * lda + aOffset);
+                FloatVector av01 = FloatVector.fromArray(SSPECIES, a, row + SSPECIES.length() + col * lda + aOffset);
+                FloatVector av02 = FloatVector.fromArray(SSPECIES, a,
+                        row + 2 * SSPECIES.length() + col * lda + aOffset);
+                FloatVector av03 = FloatVector.fromArray(SSPECIES, a,
+                        row + 3 * SSPECIES.length() + col * lda + aOffset);
+
+                xv0.fma(alphaMulYv0, av00).intoArray(a, row + col * lda + aOffset);
+                xv1.fma(alphaMulYv0, av01).intoArray(a, row + SSPECIES.length() + col * lda + aOffset);
+                xv2.fma(alphaMulYv0, av02).intoArray(a, row + 2 * SSPECIES.length() + col * lda + aOffset);
+                xv3.fma(alphaMulYv0, av03).intoArray(a, row + 3 * SSPECIES.length() + col * lda + aOffset);
+
+                FloatVector av10 = FloatVector.fromArray(SSPECIES, a, row + (col + 1) * lda + aOffset);
+                FloatVector av11 = FloatVector.fromArray(SSPECIES, a,
+                        row + SSPECIES.length() + (col + 1) * lda + aOffset);
+                FloatVector av12 = FloatVector.fromArray(SSPECIES, a,
+                        row + 2 * SSPECIES.length() + (col + 1) * lda + aOffset);
+                FloatVector av13 = FloatVector.fromArray(SSPECIES, a,
+                        row + 3 * SSPECIES.length() + (col + 1) * lda + aOffset);
+
+                xv0.fma(alphaMulYv1, av10).intoArray(a, row + (col + 1) * lda + aOffset);
+                xv1.fma(alphaMulYv1, av11).intoArray(a, row + SSPECIES.length() + (col + 1) * lda + aOffset);
+                xv2.fma(alphaMulYv1, av12).intoArray(a, row + 2 * SSPECIES.length() + (col + 1) * lda + aOffset);
+                xv3.fma(alphaMulYv1, av13).intoArray(a, row + 3 * SSPECIES.length() + (col + 1) * lda + aOffset);
+
+                FloatVector av20 = FloatVector.fromArray(SSPECIES, a, row + (col + 2) * lda + aOffset);
+                FloatVector av21 = FloatVector.fromArray(SSPECIES, a,
+                        row + SSPECIES.length() + (col + 2) * lda + aOffset);
+                FloatVector av22 = FloatVector.fromArray(SSPECIES, a,
+                        row + 2 * SSPECIES.length() + (col + 2) * lda + aOffset);
+                FloatVector av23 = FloatVector.fromArray(SSPECIES, a,
+                        row + 3 * SSPECIES.length() + (col + 2) * lda + aOffset);
+
+                xv0.fma(alphaMulYv2, av20).intoArray(a, row + (col + 2) * lda + aOffset);
+                xv1.fma(alphaMulYv2, av21).intoArray(a, row + SSPECIES.length() + (col + 2) * lda + aOffset);
+                xv2.fma(alphaMulYv2, av22).intoArray(a, row + 2 * SSPECIES.length() + (col + 2) * lda + aOffset);
+                xv3.fma(alphaMulYv2, av23).intoArray(a, row + 3 * SSPECIES.length() + (col + 2) * lda + aOffset);
+
+                FloatVector av30 = FloatVector.fromArray(SSPECIES, a, row + (col + 3) * lda + aOffset);
+                FloatVector av31 = FloatVector.fromArray(SSPECIES, a,
+                        row + SSPECIES.length() + (col + 3) * lda + aOffset);
+                FloatVector av32 = FloatVector.fromArray(SSPECIES, a,
+                        row + 2 * SSPECIES.length() + (col + 3) * lda + aOffset);
+                FloatVector av33 = FloatVector.fromArray(SSPECIES, a,
+                        row + 3 * SSPECIES.length() + (col + 3) * lda + aOffset);
+
+                xv0.fma(alphaMulYv3, av30).intoArray(a, row + (col + 3) * lda + aOffset);
+                xv1.fma(alphaMulYv3, av31).intoArray(a, row + SSPECIES.length() + (col + 3) * lda + aOffset);
+                xv2.fma(alphaMulYv3, av32).intoArray(a, row + 2 * SSPECIES.length() + (col + 3) * lda + aOffset);
+                xv3.fma(alphaMulYv3, av33).intoArray(a, row + 3 * SSPECIES.length() + (col + 3) * lda + aOffset);
+            }
+            float alphaMulY0 = alpha * y[col + yOffset];
+            float alphaMulY1 = alpha * y[col + 1 + yOffset];
+            float alphaMulY2 = alpha * y[col + 2 + yOffset];
+            float alphaMulY3 = alpha * y[col + 3 + yOffset];
+            for (; row < m; row++) { // scalar tail for rows past the last unrolled vector chunk
+                a[row + col * lda + aOffset] += alphaMulY0 * x[row + xOffset];
+                a[row + (col + 1) * lda + aOffset] += alphaMulY1 * x[row + xOffset];
+                a[row + (col + 2) * lda + aOffset] += alphaMulY2 * x[row + xOffset];
+                a[row + (col + 3) * lda + aOffset] += alphaMulY3 * x[row + xOffset];
+            }
+        }
+        for (; col < n; col++) { // columns not covered by the four-wide pass
+            int row;
+            FloatVector alphaMulYv = FloatVector.broadcast(SSPECIES, alpha * y[col + yOffset]);
+            for (row = 0; row < rowLoopBound; row += UNROLL_SIZE * SSPECIES.length()) {
+                FloatVector av0 = FloatVector.fromArray(SSPECIES, a, row + col * lda + aOffset);
+                FloatVector av1 = FloatVector.fromArray(SSPECIES, a, row + SSPECIES.length() + col * lda + aOffset);
+                FloatVector av2 = FloatVector.fromArray(SSPECIES, a, row + 2 * SSPECIES.length() + col * lda + aOffset);
+                FloatVector av3 = FloatVector.fromArray(SSPECIES, a, row + 3 * SSPECIES.length() + col * lda + aOffset);
+
+                FloatVector xv0 = FloatVector.fromArray(SSPECIES, x, row + xOffset);
+                FloatVector xv1 = FloatVector.fromArray(SSPECIES, x, row + SSPECIES.length() + xOffset);
+                FloatVector xv2 = FloatVector.fromArray(SSPECIES, x, row + 2 * SSPECIES.length() + xOffset);
+                FloatVector xv3 = FloatVector.fromArray(SSPECIES, x, row + 3 * SSPECIES.length() + xOffset);
+
+                xv0.fma(alphaMulYv, av0).intoArray(a, row + col * lda + aOffset);
+                xv1.fma(alphaMulYv, av1).intoArray(a, row + SSPECIES.length() + col * lda + aOffset);
+                xv2.fma(alphaMulYv, av2).intoArray(a, row + 2 * SSPECIES.length() + col * lda + aOffset);
+                xv3.fma(alphaMulYv, av3).intoArray(a, row + 3 * SSPECIES.length() + col * lda + aOffset);
+            }
+            float alphaMulY0 = alpha * y[col + yOffset];
+            for (; row < m; row++) {
+                a[row + col * lda + aOffset] += alphaMulY0 * x[row + xOffset];
+            }
+        }
+    }
+    /** Scalar kernel for arbitrary incx/incy; columns whose y entry is zero are skipped. */
+    private static void normalSger(int m, int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset,
+            int incy, float[] a, int aOffset, int lda) {
+        int xStartIndx = incx > 0 ? 0 : -(m - 1) * incx; // negative stride starts from the far end of the vector
+        int yStartIndx = incy > 0 ? 0 : -(n - 1) * incy;
+
+        for (int j = 0; j < n; j++, yStartIndx += incy) {
+            if (!BlasUtils.isZero(y[yStartIndx + yOffset])) {
+                for (int i = 0, xIndx = xStartIndx; i < m; i++, xIndx += incx) {
+                    a[i + j * lda + aOffset] += alpha * x[xIndx + xOffset] * y[yStartIndx + yOffset];
+                }
+            }
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sspmv.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sspmv.java
new file mode 100644
index 0000000000000000000000000000000000000000..1394b92d3d2b8331e6773bbc4484e493bc9ad404
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sspmv.java
@@ -0,0 +1,288 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas2.singleprecision;
+
+import static com.huawei.vectorblas.utils.ArrayUtil.loopBound;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+import com.huawei.vectorblas.utils.Lsame;
+
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
+public class Sspmv {
+ private static final VectorSpecies<Float> SSPECIES = FloatVector.SPECIES_MAX; // typed (not raw) float species
+
+ public static void sspmv(String uplo, int n, float alpha, float[] a, int aOffset, float[] x, int xOffset, int incx,
+ float beta, float[] y, int yOffset, int incy) { // SSPMV driver: validate, apply beta, dispatch a kernel
+ final boolean upper = Lsame.lsame(uplo, "U");
+ BlasUtils.checkParameter("SSPMV", 1, upper || Lsame.lsame(uplo, "L"));
+ BlasUtils.checkParameter("SSPMV", 2, n >= 0);
+ BlasUtils.checkParameter("SSPMV", 6, incx != 0);
+ BlasUtils.checkParameter("SSPMV", 9, incy != 0);
+
+ final boolean alphaIsZero = BlasUtils.isZero(alpha);
+ final boolean betaIsOne = Float.compare(beta, 1.0f) == 0;
+ if (n == 0 || (alphaIsZero && betaIsOne)) {
+ return; // empty problem, or alpha and beta leave y as it is
+ }
+ BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+ BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length);
+ BlasUtils.checkBlasArray("a", aOffset, (1 + n) * n / 2 - 1, a.length); // packed triangle: n * (n + 1) / 2 elements
+ if (!betaIsOne) {
+ SblasLevel2.sMulBeta(n, beta, y, yOffset, incy); // presumably scales y by beta in place -- confirm in SblasLevel2
+ }
+ if (alphaIsZero) {
+ return; // no matrix-vector term to add
+ }
+ if (incx == 1 && incy == 1) { // contiguous vectors use the vectorized kernels
+ if (upper) {
+ vecSspmvU(n, alpha, a, aOffset, x, xOffset, y, yOffset);
+ } else {
+ vecSspmvL(n, alpha, a, aOffset, x, xOffset, y, yOffset);
+ }
+ return;
+ }
+ final int xStartIndex = incx > 0 ? 0 : (n - 1) * (-incx); // a negative stride starts at the far end
+ final int yStartIndex = incy > 0 ? 0 : (n - 1) * (-incy);
+ if (upper) {
+ norSspmvU(n, alpha, a, aOffset, x, xOffset, incx, y, yOffset, incy, xStartIndex, yStartIndex);
+ } else {
+ norSspmvL(n, alpha, a, aOffset, x, xOffset, incx, y, yOffset, incy, xStartIndex, yStartIndex);
+ }
+ }
+
+ private static void vecSspmvU(int n, float alpha, float[] a, int aOffset, float[] x, int xOffset, float[] y,
+ int yOffset) { // unit-stride kernel; packed column col starts at a[aOffset + col * (col + 1) / 2]
+ int col = 0;
+ int colLoopBound = loopBound(n, 4); // presumably n rounded down to a multiple of 4 -- confirm in ArrayUtil
+ for (; col < colLoopBound; col += 4) { // 4 is unroll size for column
+ float alphaMulX0 = alpha * x[xOffset + col];
+ float alphaMulX1 = alpha * x[xOffset + (col + 1)];
+ float alphaMulX2 = alpha * x[xOffset + (col + 2)];
+ float alphaMulX3 = alpha * x[xOffset + (col + 3)];
+ FloatVector alphaMulXV0 = FloatVector.broadcast(SSPECIES, alphaMulX0);
+ FloatVector alphaMulXV1 = FloatVector.broadcast(SSPECIES, alphaMulX1);
+ FloatVector alphaMulXV2 = FloatVector.broadcast(SSPECIES, alphaMulX2);
+ FloatVector alphaMulXV3 = FloatVector.broadcast(SSPECIES, alphaMulX3);
+ FloatVector accumv0 = FloatVector.zero(SSPECIES); // per-column sums of A(row, col + k) * x[row]
+ FloatVector accumv1 = FloatVector.zero(SSPECIES);
+ FloatVector accumv2 = FloatVector.zero(SSPECIES);
+ FloatVector accumv3 = FloatVector.zero(SSPECIES);
+ int row = 0;
+ for (; row < col - col % SSPECIES.length(); row += SSPECIES.length()) { // whole vectors of rows below index col
+ FloatVector av0 = FloatVector.fromArray(SSPECIES, a, aOffset + row + col * (col + 1) / 2);
+ FloatVector av1 = FloatVector.fromArray(SSPECIES, a, aOffset + row + (col + 1) * ((col + 1) + 1) / 2);
+ FloatVector av2 = FloatVector.fromArray(SSPECIES, a, aOffset + row + (col + 2) * ((col + 2) + 1) / 2);
+ FloatVector av3 = FloatVector.fromArray(SSPECIES, a, aOffset + row + (col + 3) * ((col + 3) + 1) / 2);
+ FloatVector yv = FloatVector.fromArray(SSPECIES, y, yOffset + row);
+ FloatVector xv = FloatVector.fromArray(SSPECIES, x, xOffset + row);
+ yv = alphaMulXV0.fma(av0, yv); // y[row..] += alpha * x[col + k] * A(row.., col + k) for k = 0..3
+ yv = alphaMulXV1.fma(av1, yv);
+ yv = alphaMulXV2.fma(av2, yv);
+ alphaMulXV3.fma(av3, yv).intoArray(y, yOffset + row);
+ accumv0 = xv.fma(av0, accumv0); // the same stored elements also feed y[col + k] (symmetric use)
+ accumv1 = xv.fma(av1, accumv1);
+ accumv2 = xv.fma(av2, accumv2);
+ accumv3 = xv.fma(av3, accumv3);
+ }
+ float accum0 = alpha * accumv0.reduceLanes(VectorOperators.ADD); // lane sum, scaled by alpha
+ float accum1 = alpha * accumv1.reduceLanes(VectorOperators.ADD);
+ float accum2 = alpha * accumv2.reduceLanes(VectorOperators.ADD);
+ float accum3 = alpha * accumv3.reduceLanes(VectorOperators.ADD);
+ for (; row < col; row++) { // scalar remainder: rows the vector loop did not reach
+ float a0 = a[aOffset + row + col * (col + 1) / 2];
+ float a1 = a[aOffset + row + (col + 1) * ((col + 1) + 1) / 2];
+ float a2 = a[aOffset + row + (col + 2) * ((col + 2) + 1) / 2];
+ float a3 = a[aOffset + row + (col + 3) * ((col + 3) + 1) / 2];
+ float x0 = x[row + xOffset];
+ y[row + yOffset] += alpha * (a0 * x[col + xOffset] + a1 * x[(col + 1) + xOffset]
+ + a2 * x[(col + 2) + xOffset] + a3 * x[(col + 3) + xOffset]);
+ accum0 += alpha * a0 * x0;
+ accum1 += alpha * a1 * x0;
+ accum2 += alpha * a2 * x0;
+ accum3 += alpha * a3 * x0;
+ }
+ float a00 = a[aOffset + row + col * (col + 1) / 2]; // row == col here: stored part of the 4x4 diagonal block
+ float a01 = a[aOffset + row + (col + 1) * ((col + 1) + 1) / 2];
+ float a02 = a[aOffset + row + (col + 2) * ((col + 2) + 1) / 2];
+ float a03 = a[aOffset + row + (col + 3) * ((col + 3) + 1) / 2];
+ float a11 = a[aOffset + (row + 1) + (col + 1) * ((col + 1) + 1) / 2];
+ float a12 = a[aOffset + (row + 1) + (col + 2) * ((col + 2) + 1) / 2];
+ float a13 = a[aOffset + (row + 1) + (col + 3) * ((col + 3) + 1) / 2];
+ float a22 = a[aOffset + (row + 2) + (col + 2) * ((col + 2) + 1) / 2];
+ float a23 = a[aOffset + (row + 2) + (col + 3) * ((col + 3) + 1) / 2];
+ float a33 = a[aOffset + (row + 3) + (col + 3) * ((col + 3) + 1) / 2];
+ y[yOffset + col] += alphaMulX0 * a00 + alphaMulX1 * a01 + alphaMulX2 * a02 + alphaMulX3 * a03 + accum0;
+ y[yOffset + (col + 1)] += alphaMulX0 * a01 + alphaMulX1 * a11 + alphaMulX2 * a12 + alphaMulX3 * a13
+ + accum1; // a01 is reused in place of the element below the diagonal
+ y[yOffset + (col + 2)] += alphaMulX0 * a02 + alphaMulX1 * a12 + alphaMulX2 * a22 + alphaMulX3 * a23
+ + accum2;
+ y[yOffset + (col + 3)] += alphaMulX0 * a03 + alphaMulX1 * a13 + alphaMulX2 * a23 + alphaMulX3 * a33
+ + accum3;
+ }
+ for (; col < n; col += 1) { // columns left over after the 4-wide loop, one at a time
+ float alphaMulX0 = alpha * x[xOffset + col];
+ FloatVector accumv0 = FloatVector.zero(SSPECIES);
+ FloatVector alphaMulXV0 = FloatVector.broadcast(SSPECIES, alphaMulX0);
+ int row = 0;
+ for (; row < col - col % SSPECIES.length(); row += SSPECIES.length()) {
+ FloatVector av = FloatVector.fromArray(SSPECIES, a, aOffset + row + col * (col + 1) / 2);
+ FloatVector yv = FloatVector.fromArray(SSPECIES, y, yOffset + row);
+ FloatVector xv = FloatVector.fromArray(SSPECIES, x, xOffset + row);
+ av.fma(alphaMulXV0, yv).intoArray(y, yOffset + row);
+ accumv0 = av.fma(xv, accumv0);
+ }
+ float accum0 = accumv0.reduceLanes(VectorOperators.ADD); // unscaled; alpha is applied once at the y[col] update
+ for (; row < col; row++) {
+ float a0 = a[aOffset + row + col * (col + 1) / 2];
+ y[yOffset + row] += a0 * alphaMulX0;
+ accum0 += x[xOffset + row] * a0;
+ }
+ y[yOffset + col] += a[aOffset + row + col * (col + 1) / 2] * alphaMulX0 + alpha * accum0; // row == col: diagonal
+ }
+ }
+
+ private static void norSspmvU(int n, float alpha, float[] a, int aOffset, float[] x, int xOffset, int incx,
+ float[] y, int yOffset, int incy, int xStartIndex, int yStartIndex) {
+ int aIndx = 1;
+ for (int col = 0, xIndx = xStartIndex, yIndx = yStartIndex; col < n; col++, xIndx += incx, yIndx += incy) {
+ float alphaMulX = alpha * x[xIndx + xOffset];
+ float accum = 0.0f;
+
+ for (int row = aIndx, xi = xStartIndex, yi = yStartIndex; row < aIndx + col; row++, xi += incx,
+ yi += incy) {
+ y[yi + yOffset] += alphaMulX * a[row - 1 + aOffset];
+ accum += a[row - 1 + aOffset] * x[xi + xOffset];
+ }
+
+ y[yIndx + yOffset] = y[yIndx + yOffset] + alphaMulX * a[aIndx + col - 1 + aOffset] + alpha * accum;
+ aIndx += col + 1;
+ }
+ }
+
+ private static void vecSspmvL(int n, float alpha, float[] a, int aOffset, float[] x, int xOffset, float[] y,
+ int yOffset) { // unit-stride kernel; A(row, col) is read from a[aOffset + row - col * (col + 1) / 2 + n * col]
+ int col = 0;
+ int colLoopBound = loopBound(n, 4); // presumably n rounded down to a multiple of 4 -- confirm in ArrayUtil
+ for (; col < colLoopBound; col += 4) { // 4 is unroll size for column
+ int row = col; // start on the diagonal
+ float alphaMulX0 = alpha * x[xOffset + col];
+ float alphaMulX1 = alpha * x[xOffset + (col + 1)];
+ float alphaMulX2 = alpha * x[xOffset + (col + 2)];
+ float alphaMulX3 = alpha * x[xOffset + (col + 3)];
+ FloatVector alphaMulXV0 = FloatVector.broadcast(SSPECIES, alphaMulX0);
+ FloatVector alphaMulXV1 = FloatVector.broadcast(SSPECIES, alphaMulX1);
+ FloatVector alphaMulXV2 = FloatVector.broadcast(SSPECIES, alphaMulX2);
+ FloatVector alphaMulXV3 = FloatVector.broadcast(SSPECIES, alphaMulX3);
+ float a00 = a[aOffset + row - col * (col + 1) / 2 + n * col]; // stored part of the 4x4 diagonal block
+ float a10 = a[aOffset + (row + 1) - col * (col + 1) / 2 + n * col];
+ float a20 = a[aOffset + (row + 2) - col * (col + 1) / 2 + n * col];
+ float a30 = a[aOffset + (row + 3) - col * (col + 1) / 2 + n * col];
+ float a11 = a[aOffset + (row + 1) - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1)];
+ float a21 = a[aOffset + (row + 2) - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1)];
+ float a31 = a[aOffset + (row + 3) - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1)];
+ float a22 = a[aOffset + (row + 2) - (col + 2) * ((col + 2) + 1) / 2 + n * (col + 2)];
+ float a32 = a[aOffset + (row + 3) - (col + 2) * ((col + 2) + 1) / 2 + n * (col + 2)];
+ float a33 = a[aOffset + (row + 3) - (col + 3) * ((col + 3) + 1) / 2 + n * (col + 3)];
+ float accum0 = alphaMulX0 * a00 + alphaMulX1 * a10 + alphaMulX2 * a20 + alphaMulX3 * a30; // block's share of y[col]
+ float accum1 = alphaMulX0 * a10 + alphaMulX1 * a11 + alphaMulX2 * a21 + alphaMulX3 * a31;
+ float accum2 = alphaMulX0 * a20 + alphaMulX1 * a21 + alphaMulX2 * a22 + alphaMulX3 * a32;
+ float accum3 = alphaMulX0 * a30 + alphaMulX1 * a31 + alphaMulX2 * a32 + alphaMulX3 * a33;
+ FloatVector accumv0 = FloatVector.zero(SSPECIES);
+ FloatVector accumv1 = FloatVector.zero(SSPECIES);
+ FloatVector accumv2 = FloatVector.zero(SSPECIES);
+ FloatVector accumv3 = FloatVector.zero(SSPECIES);
+ row += 4; // first row below the diagonal block
+ for (; row <= (n - n % SSPECIES.length() - SSPECIES.length()); row += SSPECIES.length()) { // whole vectors only
+ FloatVector av0 = FloatVector.fromArray(SSPECIES, a, aOffset + row - col * (col + 1) / 2 + n * col);
+ FloatVector av1 = FloatVector.fromArray(SSPECIES, a,
+ aOffset + row - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1));
+ FloatVector av2 = FloatVector.fromArray(SSPECIES, a,
+ aOffset + row - (col + 2) * ((col + 2) + 1) / 2 + n * (col + 2));
+ FloatVector av3 = FloatVector.fromArray(SSPECIES, a,
+ aOffset + row - (col + 3) * ((col + 3) + 1) / 2 + n * (col + 3));
+ FloatVector yv = FloatVector.fromArray(SSPECIES, y, yOffset + row);
+ FloatVector xv = FloatVector.fromArray(SSPECIES, x, xOffset + row);
+ yv = alphaMulXV0.fma(av0, yv); // y[row..] += alpha * x[col + k] * A(row.., col + k) for k = 0..3
+ yv = alphaMulXV1.fma(av1, yv);
+ yv = alphaMulXV2.fma(av2, yv);
+ alphaMulXV3.fma(av3, yv).intoArray(y, yOffset + row);
+ accumv0 = xv.fma(av0, accumv0); // the same stored elements also feed y[col + k] (symmetric use)
+ accumv1 = xv.fma(av1, accumv1);
+ accumv2 = xv.fma(av2, accumv2);
+ accumv3 = xv.fma(av3, accumv3);
+ }
+ accum0 += alpha * accumv0.reduceLanes(VectorOperators.ADD); // fold the lanes into the scalar sums
+ accum1 += alpha * accumv1.reduceLanes(VectorOperators.ADD);
+ accum2 += alpha * accumv2.reduceLanes(VectorOperators.ADD);
+ accum3 += alpha * accumv3.reduceLanes(VectorOperators.ADD);
+ for (; row < n; row += 1) { // scalar remainder rows
+ float a0 = a[aOffset + row - col * (col + 1) / 2 + n * col];
+ float a1 = a[aOffset + row - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1)];
+ float a2 = a[aOffset + row - (col + 2) * ((col + 2) + 1) / 2 + n * (col + 2)];
+ float a3 = a[aOffset + row - (col + 3) * ((col + 3) + 1) / 2 + n * (col + 3)];
+ y[yOffset + row] += alphaMulX0 * a0 + alphaMulX1 * a1 + alphaMulX2 * a2 + alphaMulX3 * a3;
+ accum0 += alpha * x[xOffset + row] * a0;
+ accum1 += alpha * x[xOffset + row] * a1;
+ accum2 += alpha * x[xOffset + row] * a2;
+ accum3 += alpha * x[xOffset + row] * a3;
+ }
+ y[yOffset + col] += accum0; // diagonal-block share plus everything gathered from the rows below
+ y[yOffset + (col + 1)] += accum1;
+ y[yOffset + (col + 2)] += accum2;
+ y[yOffset + (col + 3)] += accum3;
+ }
+ for (; col < n; col += 1) { // columns left over after the 4-wide loop; scalar code only
+ float alphaMulX0 = alpha * x[xOffset + col];
+ y[yOffset + col] += a[aOffset + col - col * (col + 1) / 2 + n * col] * alphaMulX0; // diagonal element
+ int row = col + 1;
+ float accum0 = 0.0f;
+ for (; row < n; row++) {
+ float a0 = a[aOffset + row - col * (col + 1) / 2 + n * col];
+ y[yOffset + row] += a0 * alphaMulX0;
+ accum0 += x[xOffset + row] * a0;
+ }
+ y[yOffset + col] += alpha * accum0;
+ }
+ }
+
+ private static void norSspmvL(int n, float alpha, float[] a, int aOffset, float[] x, int xOffset, int incx,
+ float[] y, int yOffset, int incy, int xStartIndex, int yStartIndex) {
+ int aIndx = 1;
+ for (int col = 0, xIndx = xStartIndex, yIndx = yStartIndex; col < n; col++, xIndx += incx, yIndx += incy) {
+ float alphaMulX = alpha * x[xIndx + xOffset];
+ float accum = 0.0f;
+ y[yIndx + yOffset] += alphaMulX * a[aIndx - 1 + aOffset];
+
+ for (int row = aIndx + 1, xi = xIndx + incx, yi = yIndx + incy; row < aIndx + n - col; row++, xi += incx,
+ yi += incy) {
+ y[yi + yOffset] += alphaMulX * a[row - 1 + aOffset];
+ accum += a[row - 1 + aOffset] * x[xi + xOffset];
+ }
+ y[yIndx + yOffset] += alpha * accum;
+ aIndx += n - col;
+ }
+ }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sspr.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sspr.java
new file mode 100644
index 0000000000000000000000000000000000000000..b229bbeb4a4faaab077d21ab4952c2a04fbe6efd
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sspr.java
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas2.singleprecision;
+
+import com.huawei.vectorblas.blas1.singleprecision.Saxpy;
+import com.huawei.vectorblas.utils.BlasUtils;
+import com.huawei.vectorblas.utils.Lsame;
+
+public class Sspr {
+ /**
+ * SSPR driver for a symmetric matrix kept in packed storage.
+ *
+ * <p>For each column of the triangle selected by {@code uplo}, one {@code Saxpy.saxpy} call adds
+ * {@code alpha * x[col]} times the matching slice of {@code x} to that column's packed elements.
+ * Assuming {@code saxpy} computes {@code y += a * x}, the net effect is {@code ap += alpha * x * x^T}.
+ *
+ * <p>Returns without touching {@code ap} when {@code n == 0} or {@code BlasUtils.isZero(alpha)}.
+ * Columns whose {@code x} element is reported zero by {@code BlasUtils.isZero} are skipped.
+ * Packed columns are {@code col + 1} long for "U" and {@code n - col} long otherwise.
+ */
+ public static void sspr(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] ap,
+ int aOffset) {
+ final boolean upper = Lsame.lsame(uplo, "U");
+ BlasUtils.checkParameter("SSPR", 1, upper || Lsame.lsame(uplo, "L"));
+ BlasUtils.checkParameter("SSPR", 2, n >= 0);
+ BlasUtils.checkParameter("SSPR", 5, incx != 0);
+
+ if (n == 0 || BlasUtils.isZero(alpha)) {
+ return;
+ }
+
+ BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+ BlasUtils.checkBlasArray("a", aOffset, (1 + n) * n / 2 - 1, ap.length);
+
+ final boolean forward = incx > 0;
+ // With a non-positive stride the walk over x starts at its highest used index.
+ int xPos = forward ? 0 : -(n - 1) * incx;
+ int colStart = 0; // number of packed elements stored before the current column
+ for (int col = 0; col < n; col++, xPos += incx) {
+ final int colLen = upper ? col + 1 : n - col;
+ final float xCol = x[xPos + xOffset];
+ if (!BlasUtils.isZero(xCol)) {
+ // Base index handed to saxpy: 0 for upper/forward and lower/backward, else the current x position.
+ final int sliceBase = (upper == forward) ? 0 : xPos;
+ Saxpy.saxpy(colLen, alpha * xCol, x, xOffset + sliceBase, incx, ap, aOffset + colStart, 1);
+ }
+ colStart += colLen;
+ }
+ }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Ssymv.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Ssymv.java
new file mode 100644
index 0000000000000000000000000000000000000000..2af10f8ce7d6f4f0fcdd59ceb15e86a6056aa17a
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Ssymv.java
@@ -0,0 +1,279 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas2.singleprecision;
+
+import static com.huawei.vectorblas.utils.ArrayUtil.loopBound;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+import com.huawei.vectorblas.utils.Lsame;
+
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
+public class Ssymv {
+ private static final VectorSpecies<Float> SSPECIES = FloatVector.SPECIES_MAX; // typed (not raw) float species
+
+ public static void ssymv(String uplo, int n, float alpha, float[] a, int aOffset, int lda, float[] x, int xOffset,
+ int incx, float beta, float[] y, int yOffset, int incy) { // SSYMV driver: validate, apply beta, dispatch
+ final boolean upper = Lsame.lsame(uplo, "U");
+ BlasUtils.checkParameter("SSYMV", 1, upper || Lsame.lsame(uplo, "L"));
+ BlasUtils.checkParameter("SSYMV", 2, n >= 0);
+ BlasUtils.checkParameter("SSYMV", 5, lda >= Math.max(1, n));
+ BlasUtils.checkParameter("SSYMV", 7, incx != 0);
+ BlasUtils.checkParameter("SSYMV", 10, incy != 0);
+
+ final boolean alphaIsZero = BlasUtils.isZero(alpha);
+ final boolean betaIsOne = Float.compare(beta, 1.0f) == 0;
+ if (n == 0 || (alphaIsZero && betaIsOne)) {
+ return; // empty problem, or alpha and beta leave y as it is
+ }
+ BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
+ BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length);
+ BlasUtils.checkBlasArray("a", aOffset, (n - 1) + (n - 1) * lda, a.length); // last element touched: A(n-1, n-1)
+ if (!betaIsOne) {
+ SblasLevel2.sMulBeta(n, beta, y, yOffset, incy); // presumably scales y by beta in place -- confirm in SblasLevel2
+ }
+ if (alphaIsZero) {
+ return; // no matrix-vector term to add
+ }
+ final int xStartIndex = incx > 0 ? 0 : (n - 1) * (-incx); // a negative stride starts at the far end
+ final int yStartIndex = incy > 0 ? 0 : (n - 1) * (-incy);
+ final boolean unitStride = incx == 1 && incy == 1;
+ // Contiguous vectors use the vectorized kernels; any other stride uses the scalar ones.
+ if (unitStride && upper) {
+ vecSsymvU(n, x, xOffset, alpha, y, yOffset, a, aOffset, lda);
+ } else if (unitStride) {
+ vecSsymvL(n, x, xOffset, alpha, y, yOffset, a, aOffset, lda);
+ } else if (upper) {
+ norSsymvU(n, x, xOffset, incx, alpha, y, yOffset, incy, a, aOffset, lda, xStartIndex, yStartIndex);
+ } else {
+ norSsymvL(n, x, xOffset, incx, alpha, y, yOffset, incy, a, aOffset, lda, xStartIndex, yStartIndex);
+ }
+ }
+
+ private static void vecSsymvU(int n, float[] x, int xOffset, float alpha, float[] y, int yOffset, float[] a,
+ int aOffset, int lda) { // unit-stride kernel; reads A(row, col) = a[row + col * lda + aOffset] for row <= col only
+ int col = 0;
+ int colLoopBound = loopBound(n, 4); // presumably n rounded down to a multiple of 4 -- confirm in ArrayUtil
+ for (; col < colLoopBound; col += 4) { // 4 is unroll size for column
+ float alphaMulX0 = alpha * x[col + xOffset];
+ float alphaMulX1 = alpha * x[(col + 1) + xOffset];
+ float alphaMulX2 = alpha * x[(col + 2) + xOffset];
+ float alphaMulX3 = alpha * x[(col + 3) + xOffset];
+ FloatVector alphaXv0 = FloatVector.broadcast(SSPECIES, alphaMulX0);
+ FloatVector alphaXv1 = FloatVector.broadcast(SSPECIES, alphaMulX1);
+ FloatVector alphaXv2 = FloatVector.broadcast(SSPECIES, alphaMulX2);
+ FloatVector alphaXv3 = FloatVector.broadcast(SSPECIES, alphaMulX3);
+ FloatVector accumv0 = FloatVector.zero(SSPECIES); // per-column sums of A(row, col + k) * x[row]
+ FloatVector accumv1 = FloatVector.zero(SSPECIES);
+ FloatVector accumv2 = FloatVector.zero(SSPECIES);
+ FloatVector accumv3 = FloatVector.zero(SSPECIES);
+ int row = 0;
+ for (; row < col - col % SSPECIES.length(); row += SSPECIES.length()) { // whole vectors of rows below index col
+ FloatVector av0 = FloatVector.fromArray(SSPECIES, a, row + col * lda + aOffset);
+ FloatVector av1 = FloatVector.fromArray(SSPECIES, a, row + (col + 1) * lda + aOffset);
+ FloatVector av2 = FloatVector.fromArray(SSPECIES, a, row + (col + 2) * lda + aOffset);
+ FloatVector av3 = FloatVector.fromArray(SSPECIES, a, row + (col + 3) * lda + aOffset);
+ FloatVector yv = FloatVector.fromArray(SSPECIES, y, row + yOffset);
+ FloatVector xv = FloatVector.fromArray(SSPECIES, x, row + xOffset);
+ yv = av0.fma(alphaXv0, yv); // y[row..] += alpha * x[col + k] * A(row.., col + k) for k = 0..3
+ yv = av1.fma(alphaXv1, yv);
+ yv = av2.fma(alphaXv2, yv);
+ av3.fma(alphaXv3, yv).intoArray(y, row + yOffset);
+ accumv0 = av0.fma(xv, accumv0); // the same stored elements also feed y[col + k] (symmetric use)
+ accumv1 = av1.fma(xv, accumv1);
+ accumv2 = av2.fma(xv, accumv2);
+ accumv3 = av3.fma(xv, accumv3);
+ }
+ float accum0 = alpha * accumv0.reduceLanes(VectorOperators.ADD); // lane sum, scaled by alpha
+ float accum1 = alpha * accumv1.reduceLanes(VectorOperators.ADD);
+ float accum2 = alpha * accumv2.reduceLanes(VectorOperators.ADD);
+ float accum3 = alpha * accumv3.reduceLanes(VectorOperators.ADD);
+ for (; row < col; row++) { // scalar remainder: rows the vector loop did not reach
+ float a0 = a[row + col * lda + aOffset];
+ float a1 = a[row + (col + 1) * lda + aOffset];
+ float a2 = a[row + (col + 2) * lda + aOffset];
+ float a3 = a[row + (col + 3) * lda + aOffset];
+ float x0 = x[row + xOffset];
+ y[row + yOffset] += alpha * (a0 * x[col + xOffset] + a1 * x[(col + 1) + xOffset]
+ + a2 * x[(col + 2) + xOffset] + a3 * x[(col + 3) + xOffset]);
+ accum0 += alpha * a0 * x0;
+ accum1 += alpha * a1 * x0;
+ accum2 += alpha * a2 * x0;
+ accum3 += alpha * a3 * x0;
+ }
+ float a00 = a[row + col * lda + aOffset]; // row == col here: upper part of the 4x4 diagonal block
+ float a01 = a[row + (col + 1) * lda + aOffset];
+ float a02 = a[row + (col + 2) * lda + aOffset];
+ float a03 = a[row + (col + 3) * lda + aOffset];
+ float a11 = a[(row + 1) + (col + 1) * lda + aOffset];
+ float a12 = a[(row + 1) + (col + 2) * lda + aOffset];
+ float a13 = a[(row + 1) + (col + 3) * lda + aOffset];
+ float a22 = a[(row + 2) + (col + 2) * lda + aOffset];
+ float a23 = a[(row + 2) + (col + 3) * lda + aOffset];
+ float a33 = a[(row + 3) + (col + 3) * lda + aOffset];
+ y[col + yOffset] += a00 * alphaMulX0 + a01 * alphaMulX1 + a02 * alphaMulX2 + a03 * alphaMulX3 + accum0;
+ y[(col + 1) + yOffset] += a01 * alphaMulX0 + a11 * alphaMulX1 + a12 * alphaMulX2 + a13 * alphaMulX3
+ + accum1; // a01 is reused in place of the element below the diagonal
+ y[(col + 2) + yOffset] += a02 * alphaMulX0 + a12 * alphaMulX1 + a22 * alphaMulX2 + a23 * alphaMulX3
+ + accum2;
+ y[(col + 3) + yOffset] += a03 * alphaMulX0 + a13 * alphaMulX1 + a23 * alphaMulX2 + a33 * alphaMulX3
+ + accum3;
+ }
+ for (; col < n; col++) { // columns left over after the 4-wide loop, one at a time
+ float alphaMulX0 = alpha * x[col + xOffset];
+ FloatVector alphaXv0 = FloatVector.broadcast(SSPECIES, alphaMulX0);
+ FloatVector accumv0 = FloatVector.zero(SSPECIES);
+ int row = 0;
+ for (; row < col - col % SSPECIES.length(); row += SSPECIES.length()) {
+ FloatVector av = FloatVector.fromArray(SSPECIES, a, row + col * lda + aOffset);
+ FloatVector yv = FloatVector.fromArray(SSPECIES, y, row + yOffset);
+ FloatVector xv = FloatVector.fromArray(SSPECIES, x, row + xOffset);
+ av.fma(alphaXv0, yv).intoArray(y, row + yOffset);
+ accumv0 = av.fma(xv, accumv0);
+ }
+ float accum0 = alpha * accumv0.reduceLanes(VectorOperators.ADD); // lane sum, scaled by alpha
+ for (; row < col; row++) {
+ float a0 = a[row + col * lda + aOffset];
+ y[row + yOffset] += a0 * alphaMulX0;
+ accum0 += alpha * a0 * x[row + xOffset];
+ }
+ y[col + yOffset] += a[row + col * lda + aOffset] * alphaMulX0 + accum0; // row == col: diagonal element
+ }
+ }
+
+ private static void norSsymvU(int n, float[] x, int xOffset, int incx, float alpha, float[] y, int yOffset,
+ int incy, float[] a, int aOffset, int lda, int xStartIndex, int yStartIndex) {
+ for (int col = 0, xj = xStartIndex, yj = yStartIndex; col < n; col++, xj += incx, yj += incy) {
+ float alphaMulX = alpha * x[xj + xOffset];
+ float accum = 0.0f;
+
+ for (int row = 0, xIndx = xStartIndex, yIndx = yStartIndex; row < col; row++, xIndx += incx,
+ yIndx += incy) {
+ y[yIndx + yOffset] += alphaMulX * a[row + col * lda + aOffset];
+ accum += a[row + col * lda + aOffset] * x[xIndx + xOffset];
+ }
+ y[yj + yOffset] += alphaMulX * a[col + col * lda + aOffset] + alpha * accum;
+ }
+ }
+
+ private static void vecSsymvL(int n, float[] x, int xOffset, float alpha, float[] y, int yOffset, float[] a,
+ int aOffset, int lda) { // unit-stride kernel; reads A(row, col) = a[aOffset + row + col * lda] for row >= col only
+ int col = 0;
+ int colLoopBound = loopBound(n, 4); // presumably n rounded down to a multiple of 4 -- confirm in ArrayUtil
+ for (; col < colLoopBound; col += 4) { // 4 is unroll size for column
+ int row = col; // start on the diagonal
+ float a00 = a[aOffset + row + col * lda]; // lower part of the 4x4 diagonal block
+ float a10 = a[aOffset + (row + 1) + col * lda];
+ float a20 = a[aOffset + (row + 2) + col * lda];
+ float a30 = a[aOffset + (row + 3) + col * lda];
+ float a11 = a[aOffset + (row + 1) + (col + 1) * lda];
+ float a21 = a[aOffset + (row + 2) + (col + 1) * lda];
+ float a31 = a[aOffset + (row + 3) + (col + 1) * lda];
+ float a22 = a[aOffset + (row + 2) + (col + 2) * lda];
+ float a32 = a[aOffset + (row + 3) + (col + 2) * lda];
+ float a33 = a[aOffset + (row + 3) + (col + 3) * lda];
+ float alphaMulX0 = alpha * x[xOffset + col];
+ float alphaMulX1 = alpha * x[xOffset + (col + 1)];
+ float alphaMulX2 = alpha * x[xOffset + (col + 2)];
+ float alphaMulX3 = alpha * x[xOffset + (col + 3)];
+ float accum0 = alphaMulX0 * a00 + alphaMulX1 * a10 + alphaMulX2 * a20 + alphaMulX3 * a30; // block's share of y[col]
+ float accum1 = alphaMulX0 * a10 + alphaMulX1 * a11 + alphaMulX2 * a21 + alphaMulX3 * a31;
+ float accum2 = alphaMulX0 * a20 + alphaMulX1 * a21 + alphaMulX2 * a22 + alphaMulX3 * a32;
+ float accum3 = alphaMulX0 * a30 + alphaMulX1 * a31 + alphaMulX2 * a32 + alphaMulX3 * a33;
+ FloatVector alphaMulXV0 = FloatVector.broadcast(SSPECIES, alphaMulX0);
+ FloatVector alphaMulXV1 = FloatVector.broadcast(SSPECIES, alphaMulX1);
+ FloatVector alphaMulXV2 = FloatVector.broadcast(SSPECIES, alphaMulX2);
+ FloatVector alphaMulXV3 = FloatVector.broadcast(SSPECIES, alphaMulX3);
+ FloatVector accumv0 = FloatVector.zero(SSPECIES);
+ FloatVector accumv1 = FloatVector.zero(SSPECIES);
+ FloatVector accumv2 = FloatVector.zero(SSPECIES);
+ FloatVector accumv3 = FloatVector.zero(SSPECIES);
+ row += 4; // first row below the diagonal block
+ for (; row <= (n - n % SSPECIES.length() - SSPECIES.length()); row += SSPECIES.length()) { // whole vectors only
+ FloatVector av0 = FloatVector.fromArray(SSPECIES, a, aOffset + row + col * lda);
+ FloatVector av1 = FloatVector.fromArray(SSPECIES, a, aOffset + row + (col + 1) * lda);
+ FloatVector av2 = FloatVector.fromArray(SSPECIES, a, aOffset + row + (col + 2) * lda);
+ FloatVector av3 = FloatVector.fromArray(SSPECIES, a, aOffset + row + (col + 3) * lda);
+ FloatVector yv = FloatVector.fromArray(SSPECIES, y, yOffset + row);
+ FloatVector xv = FloatVector.fromArray(SSPECIES, x, xOffset + row);
+ yv = alphaMulXV0.fma(av0, yv); // y[row..] += alpha * x[col + k] * A(row.., col + k) for k = 0..3
+ yv = alphaMulXV1.fma(av1, yv);
+ yv = alphaMulXV2.fma(av2, yv);
+ alphaMulXV3.fma(av3, yv).intoArray(y, yOffset + row);
+ accumv0 = xv.fma(av0, accumv0); // the same stored elements also feed y[col + k] (symmetric use)
+ accumv1 = xv.fma(av1, accumv1);
+ accumv2 = xv.fma(av2, accumv2);
+ accumv3 = xv.fma(av3, accumv3);
+ }
+ accum0 += alpha * accumv0.reduceLanes(VectorOperators.ADD); // fold the lanes into the scalar sums
+ accum1 += alpha * accumv1.reduceLanes(VectorOperators.ADD);
+ accum2 += alpha * accumv2.reduceLanes(VectorOperators.ADD);
+ accum3 += alpha * accumv3.reduceLanes(VectorOperators.ADD);
+ for (; row < n; row += 1) { // scalar remainder rows
+ float a0 = a[aOffset + row + col * lda];
+ float a1 = a[aOffset + row + (col + 1) * lda];
+ float a2 = a[aOffset + row + (col + 2) * lda];
+ float a3 = a[aOffset + row + (col + 3) * lda];
+ y[yOffset + row] += alphaMulX0 * a0 + alphaMulX1 * a1 + alphaMulX2 * a2 + alphaMulX3 * a3;
+ accum0 += alpha * x[xOffset + row] * a0;
+ accum1 += alpha * x[xOffset + row] * a1;
+ accum2 += alpha * x[xOffset + row] * a2;
+ accum3 += alpha * x[xOffset + row] * a3;
+ }
+ y[yOffset + col] += accum0; // diagonal-block share plus everything gathered from the rows below
+ y[yOffset + (col + 1)] += accum1;
+ y[yOffset + (col + 2)] += accum2;
+ y[yOffset + (col + 3)] += accum3;
+ }
+ for (; col < n; col += 1) { // columns left over after the 4-wide loop; scalar code only
+ float alphaMulX0 = alpha * x[xOffset + col];
+ y[yOffset + col] += a[aOffset + col + col * lda] * alphaMulX0; // diagonal element
+ int row = col + 1;
+ float accum0 = 0.0f;
+ for (; row < n; row++) {
+ float a0 = a[aOffset + row + col * lda];
+ y[yOffset + row] += a0 * alphaMulX0;
+ accum0 += x[xOffset + row] * a0;
+ }
+ y[yOffset + col] += alpha * accum0;
+ }
+ }
+
+ private static void norSsymvL(int n, float[] x, int xOffset, int incx, float alpha, float[] y, int yOffset,
+ int incy, float[] a, int aOffset, int lda, int xStartIndex, int yStartIndex) {
+ for (int col = 0, xj = xStartIndex, yj = yStartIndex; col < n; col++, xj += incx, yj += incy) {
+ float alphaMulX = alpha * x[xj + xOffset];
+ y[yj + yOffset] += alphaMulX * a[col + col * lda + aOffset];
+ float accum = 0.0f;
+
+ for (int row = col + 1, xIndx = xj + incx, yIndx = yj + incy; row < n; row++, xIndx += incx,
+ yIndx += incy) {
+ y[yIndx + yOffset] += alphaMulX * a[row + col * lda + aOffset];
+ accum += a[row + col * lda + aOffset] * x[xIndx + xOffset];
+ }
+ y[yj + yOffset] += alpha * accum;
+ }
+ }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/doubleprecision/DblasLevel3.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/doubleprecision/DblasLevel3.java
new file mode 100644
index 0000000000000000000000000000000000000000..df6ab57a6a09bbce340db416cf1bce0057c7107d
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/doubleprecision/DblasLevel3.java
@@ -0,0 +1,475 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas3.doubleprecision;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorSpecies;
+
+public class DblasLevel3 {
+ /** Species used by all kernels of this class; parameterized so that its uses are type-checked. */
+ private static final VectorSpecies<Double> DSPECIES = DoubleVector.SPECIES_MAX;
+ /** Blocking size for m direction. */
+ protected static final int DGEMM_P = 256;
+ /** Blocking size for k direction. */
+ protected static final int DGEMM_Q = 240;
+ /** Blocking size for n direction. */
+ protected static final int DGEMM_R = 8192;
+ /** Number of double lanes in one vector of DSPECIES. */
+ protected static final int VECTOR_LENGTH = DSPECIES.length();
+ /** 2 times vector length. */
+ protected static final int VECTOR_LENGTH2 = 2 * VECTOR_LENGTH;
+ /** 3 times vector length. */
+ protected static final int VECTOR_LENGTH3 = 3 * VECTOR_LENGTH;
+ /** 4 times vector length. */
+ protected static final int VECTOR_LENGTH4 = 4 * VECTOR_LENGTH;
+ /** Kernel size for m direction: 4 * DSPECIES.length(). */
+ protected static final int DGEMM_UNROLL_M = 4 * VECTOR_LENGTH;
+ /** Kernel size for n direction. */
+ protected static final int DGEMM_UNROLL_N = 4;
+
+ /**
+  * Scales the sizeM x sizeN block of the column-major matrix dc by beta, in place.
+  * A beta of zero stores zeros instead of multiplying, so NaN or infinite values already present
+  * in dc do not survive (0 * NaN is NaN); this matches the reference BLAS rule that C need not be
+  * set on input when beta is zero.
+  *
+  * @param sizeM number of rows to scale
+  * @param sizeN number of columns to scale
+  * @param beta scale factor
+  * @param dc matrix storage, updated in place
+  * @param cOffset index in dc of the first element of the block
+  * @param ldc leading dimension of dc
+  */
+ protected static void betaMulC(int sizeM, int sizeN, double beta, double[] dc, int cOffset, int ldc) {
+     if (beta == 0.0d) {
+         for (int col = 0; col < sizeN; col++) {
+             final int colStart = col * ldc + cOffset;
+             for (int row = 0; row < sizeM; row++) {
+                 dc[colStart + row] = 0.0d;
+             }
+         }
+         return;
+     }
+     DoubleVector betav = DoubleVector.broadcast(DSPECIES, beta);
+     for (int col = 0; col < sizeN; col++) {
+         final int colStart = col * ldc + cOffset;
+         int row = 0;
+         // "<=" lets the vector loop also take the last complete vector of the column.
+         for (; row <= sizeM - VECTOR_LENGTH; row += VECTOR_LENGTH) {
+             DoubleVector cv = DoubleVector.fromArray(DSPECIES, dc, colStart + row);
+             cv.mul(betav).intoArray(dc, colStart + row);
+         }
+         // Fewer than VECTOR_LENGTH rows are left.
+         for (; row < sizeM; row++) {
+             dc[colStart + row] *= beta;
+         }
+     }
+ }
+
+ /**
+  * Adds alpha times the product of the packed blocks da and db to the block of dc that starts at
+  * row csRow and column csCol, by running the two kernels below one after the other.
+  */
+ protected static void kernelOperation8x4(int mc, int nc, int kc, double alpha, double[] da, double[] db,
+ int bOffset, double[] dc, int ldc, int cOffset, int csRow, int csCol) {
+ // Complete groups of DGEMM_UNROLL_N columns.
+ kernelOperation8x4Main(mc, nc, kc, alpha, da, db, bOffset, dc, ldc, cOffset, csRow, csCol);
+ // Remaining nc % DGEMM_UNROLL_N columns.
+ kernelOperation8x4NBorder(mc, nc, kc, alpha, da, db, bOffset, dc, ldc, cOffset, csRow, csCol);
+ }
+
+ /**
+  * Border kernel for the nc % DGEMM_UNROLL_N columns that do not fill a whole column group.
+  * For each of those columns it adds alpha * (packed block da) * (packed column of db) to the
+  * matching column of dc. Rows are walked in the chunk sizes used by the left-hand packing
+  * routines: four vectors, then two vectors, then one vector, then single elements.
+  *
+  * @param mc number of rows of the block
+  * @param nc number of columns of the block (only the last nc % DGEMM_UNROLL_N are handled here)
+  * @param kc length of the inner (k) dimension of the block
+  * @param alpha scalar applied to the product
+  * @param da packed left-hand block, read from index 0
+  * @param db packed right-hand block
+  * @param bOffset index in db at which the packed data starts
+  * @param dc column-major result matrix, updated in place
+  * @param ldc leading dimension of dc
+  * @param cOffset index in dc of the first element of the matrix
+  * @param csRow first row of dc covered by this block
+  * @param csCol first column of dc covered by this block
+  */
+ private static void kernelOperation8x4NBorder(int mc, int nc, int kc, double alpha, double[] da, double[] db,
+         int bOffset, double[] dc, int ldc, int cOffset, int csRow, int csCol) {
+     DoubleVector alphaVec = DoubleVector.broadcast(DSPECIES, alpha);
+     int cCol = csCol + (nc / DGEMM_UNROLL_N) * DGEMM_UNROLL_N;
+     for (int countJ = nc % DGEMM_UNROLL_N; countJ > 0; countJ--, cCol++) {
+         // Leftover columns of db are packed one after another, kc values per column.
+         final int bStart = bOffset + (nc - countJ) * kc;
+         int cIndx = cOffset + csRow + cCol * ldc;
+         int aIndx = 0;
+
+         // Row chunks that are four vectors tall.
+         for (int countI = mc / DGEMM_UNROLL_M; countI > 0; countI--) {
+             DoubleVector c00 = DoubleVector.zero(DSPECIES);
+             DoubleVector c10 = DoubleVector.zero(DSPECIES);
+             DoubleVector c20 = DoubleVector.zero(DSPECIES);
+             DoubleVector c30 = DoubleVector.zero(DSPECIES);
+             for (int l = 0; l < kc; l++) {
+                 DoubleVector a0 = DoubleVector.fromArray(DSPECIES, da, aIndx);
+                 DoubleVector a1 = DoubleVector.fromArray(DSPECIES, da, aIndx + VECTOR_LENGTH);
+                 DoubleVector a2 = DoubleVector.fromArray(DSPECIES, da, aIndx + VECTOR_LENGTH2);
+                 DoubleVector a3 = DoubleVector.fromArray(DSPECIES, da, aIndx + VECTOR_LENGTH3);
+                 DoubleVector b0 = DoubleVector.broadcast(DSPECIES, db[bStart + l]);
+
+                 c00 = a0.fma(b0, c00);
+                 c10 = a1.fma(b0, c10);
+                 c20 = a2.fma(b0, c20);
+                 c30 = a3.fma(b0, c30);
+
+                 aIndx += DGEMM_UNROLL_M;
+             }
+             alphaVec.fma(c00, DoubleVector.fromArray(DSPECIES, dc, cIndx)).intoArray(dc, cIndx);
+             alphaVec.fma(c10, DoubleVector.fromArray(DSPECIES, dc, cIndx + VECTOR_LENGTH))
+                 .intoArray(dc, cIndx + VECTOR_LENGTH);
+             alphaVec.fma(c20, DoubleVector.fromArray(DSPECIES, dc, cIndx + VECTOR_LENGTH2))
+                 .intoArray(dc, cIndx + VECTOR_LENGTH2);
+             alphaVec.fma(c30, DoubleVector.fromArray(DSPECIES, dc, cIndx + VECTOR_LENGTH3))
+                 .intoArray(dc, cIndx + VECTOR_LENGTH3);
+
+             cIndx += DGEMM_UNROLL_M;
+         }
+
+         int countI = mc % DGEMM_UNROLL_M;
+
+         // At most one chunk that is two vectors tall.
+         if (countI >= VECTOR_LENGTH2) {
+             DoubleVector c00 = DoubleVector.zero(DSPECIES);
+             DoubleVector c10 = DoubleVector.zero(DSPECIES);
+             for (int l = 0; l < kc; l++) {
+                 DoubleVector a0 = DoubleVector.fromArray(DSPECIES, da, aIndx);
+                 DoubleVector a1 = DoubleVector.fromArray(DSPECIES, da, aIndx + VECTOR_LENGTH);
+                 DoubleVector b0 = DoubleVector.broadcast(DSPECIES, db[bStart + l]);
+
+                 c00 = a0.fma(b0, c00);
+                 c10 = a1.fma(b0, c10);
+
+                 aIndx += VECTOR_LENGTH2;
+             }
+             alphaVec.fma(c00, DoubleVector.fromArray(DSPECIES, dc, cIndx)).intoArray(dc, cIndx);
+             alphaVec.fma(c10, DoubleVector.fromArray(DSPECIES, dc, cIndx + VECTOR_LENGTH))
+                 .intoArray(dc, cIndx + VECTOR_LENGTH);
+
+             cIndx += VECTOR_LENGTH2;
+             countI -= VECTOR_LENGTH2;
+         }
+
+         // At most one chunk that is a single vector tall.
+         if (countI >= VECTOR_LENGTH) {
+             DoubleVector c00 = DoubleVector.zero(DSPECIES);
+             for (int l = 0; l < kc; l++) {
+                 DoubleVector a0 = DoubleVector.fromArray(DSPECIES, da, aIndx);
+                 DoubleVector b0 = DoubleVector.broadcast(DSPECIES, db[bStart + l]);
+                 c00 = a0.fma(b0, c00);
+                 aIndx += VECTOR_LENGTH;
+             }
+             alphaVec.fma(c00, DoubleVector.fromArray(DSPECIES, dc, cIndx)).intoArray(dc, cIndx);
+
+             cIndx += VECTOR_LENGTH;
+             countI -= VECTOR_LENGTH;
+         }
+
+         // Remaining single rows.
+         for (; countI > 0; countI--) {
+             double sum = 0.0d;
+             for (int l = 0; l < kc; l++) {
+                 // bStart already contains bOffset, like every other read of db in the kernels.
+                 sum += da[aIndx] * db[bStart + l];
+                 aIndx += 1;
+             }
+             dc[cIndx] += alpha * sum;
+             cIndx += 1;
+         }
+     }
+ }
+
+ /**
+  * Main kernel: for every complete group of DGEMM_UNROLL_N columns, adds
+  * alpha * (packed block da) * (packed panel of db) to the matching columns of dc.
+  * Rows are walked in chunks of four vectors, then two vectors, then one vector, then single
+  * elements; da is consumed sequentially from index 0 for each column group.
+  *
+  * @param mc number of rows of the block
+  * @param nc number of columns of the block (only the first nc / DGEMM_UNROLL_N groups are handled)
+  * @param kc length of the inner (k) dimension of the block
+  * @param alpha scalar applied to the product
+  * @param da packed left-hand block, read from index 0
+  * @param db packed right-hand block
+  * @param bOffset index in db at which the packed data starts
+  * @param dc column-major result matrix, updated in place
+  * @param ldc leading dimension of dc
+  * @param cOffset index in dc of the first element of the matrix
+  * @param csRow first row of dc covered by this block
+  * @param csCol first column of dc covered by this block
+  */
+ private static void kernelOperation8x4Main(int mc, int nc, int kc, double alpha, double[] da, double[] db,
+         int bOffset, double[] dc, int ldc, int cOffset, int csRow, int csCol) {
+     final DoubleVector alphaVec = DoubleVector.broadcast(DSPECIES, alpha);
+     final DoubleVector zero = DoubleVector.zero(DSPECIES);
+     final int panels = nc / DGEMM_UNROLL_N;
+     for (int panel = 0; panel < panels; panel++) {
+         // Each packed panel of db holds kc groups of DGEMM_UNROLL_N values.
+         final int bStart = bOffset + panel * DGEMM_UNROLL_N * kc;
+         // Index in dc of the current row within the first column of this panel.
+         int cIndx = cOffset + csRow + (csCol + panel * DGEMM_UNROLL_N) * ldc;
+         int aIndx = 0;
+
+         // Row chunks that are four vectors tall: one accumulator per (vector, column) pair.
+         for (int chunks = mc / DGEMM_UNROLL_M; chunks > 0; chunks--) {
+             DoubleVector c00 = zero, c10 = zero, c20 = zero, c30 = zero;
+             DoubleVector c01 = zero, c11 = zero, c21 = zero, c31 = zero;
+             DoubleVector c02 = zero, c12 = zero, c22 = zero, c32 = zero;
+             DoubleVector c03 = zero, c13 = zero, c23 = zero, c33 = zero;
+             int bIndx = bStart;
+             for (int l = kc; l > 0; l--) {
+                 final DoubleVector a0 = DoubleVector.fromArray(DSPECIES, da, aIndx);
+                 final DoubleVector a1 = DoubleVector.fromArray(DSPECIES, da, aIndx + VECTOR_LENGTH);
+                 final DoubleVector a2 = DoubleVector.fromArray(DSPECIES, da, aIndx + VECTOR_LENGTH2);
+                 final DoubleVector a3 = DoubleVector.fromArray(DSPECIES, da, aIndx + VECTOR_LENGTH3);
+
+                 DoubleVector bv = DoubleVector.broadcast(DSPECIES, db[bIndx]);
+                 c00 = a0.fma(bv, c00);
+                 c10 = a1.fma(bv, c10);
+                 c20 = a2.fma(bv, c20);
+                 c30 = a3.fma(bv, c30);
+
+                 bv = DoubleVector.broadcast(DSPECIES, db[bIndx + 1]);
+                 c01 = a0.fma(bv, c01);
+                 c11 = a1.fma(bv, c11);
+                 c21 = a2.fma(bv, c21);
+                 c31 = a3.fma(bv, c31);
+
+                 bv = DoubleVector.broadcast(DSPECIES, db[bIndx + 2]);
+                 c02 = a0.fma(bv, c02);
+                 c12 = a1.fma(bv, c12);
+                 c22 = a2.fma(bv, c22);
+                 c32 = a3.fma(bv, c32);
+
+                 bv = DoubleVector.broadcast(DSPECIES, db[bIndx + 3]);
+                 c03 = a0.fma(bv, c03);
+                 c13 = a1.fma(bv, c13);
+                 c23 = a2.fma(bv, c23);
+                 c33 = a3.fma(bv, c33);
+
+                 aIndx += DGEMM_UNROLL_M;
+                 bIndx += DGEMM_UNROLL_N;
+             }
+
+             // Column 0 of the panel.
+             int cPos = cIndx;
+             alphaVec.fma(c00, DoubleVector.fromArray(DSPECIES, dc, cPos)).intoArray(dc, cPos);
+             alphaVec.fma(c10, DoubleVector.fromArray(DSPECIES, dc, cPos + VECTOR_LENGTH))
+                 .intoArray(dc, cPos + VECTOR_LENGTH);
+             alphaVec.fma(c20, DoubleVector.fromArray(DSPECIES, dc, cPos + VECTOR_LENGTH2))
+                 .intoArray(dc, cPos + VECTOR_LENGTH2);
+             alphaVec.fma(c30, DoubleVector.fromArray(DSPECIES, dc, cPos + VECTOR_LENGTH3))
+                 .intoArray(dc, cPos + VECTOR_LENGTH3);
+
+             // Column 1 of the panel.
+             cPos += ldc;
+             alphaVec.fma(c01, DoubleVector.fromArray(DSPECIES, dc, cPos)).intoArray(dc, cPos);
+             alphaVec.fma(c11, DoubleVector.fromArray(DSPECIES, dc, cPos + VECTOR_LENGTH))
+                 .intoArray(dc, cPos + VECTOR_LENGTH);
+             alphaVec.fma(c21, DoubleVector.fromArray(DSPECIES, dc, cPos + VECTOR_LENGTH2))
+                 .intoArray(dc, cPos + VECTOR_LENGTH2);
+             alphaVec.fma(c31, DoubleVector.fromArray(DSPECIES, dc, cPos + VECTOR_LENGTH3))
+                 .intoArray(dc, cPos + VECTOR_LENGTH3);
+
+             // Column 2 of the panel.
+             cPos += ldc;
+             alphaVec.fma(c02, DoubleVector.fromArray(DSPECIES, dc, cPos)).intoArray(dc, cPos);
+             alphaVec.fma(c12, DoubleVector.fromArray(DSPECIES, dc, cPos + VECTOR_LENGTH))
+                 .intoArray(dc, cPos + VECTOR_LENGTH);
+             alphaVec.fma(c22, DoubleVector.fromArray(DSPECIES, dc, cPos + VECTOR_LENGTH2))
+                 .intoArray(dc, cPos + VECTOR_LENGTH2);
+             alphaVec.fma(c32, DoubleVector.fromArray(DSPECIES, dc, cPos + VECTOR_LENGTH3))
+                 .intoArray(dc, cPos + VECTOR_LENGTH3);
+
+             // Column 3 of the panel.
+             cPos += ldc;
+             alphaVec.fma(c03, DoubleVector.fromArray(DSPECIES, dc, cPos)).intoArray(dc, cPos);
+             alphaVec.fma(c13, DoubleVector.fromArray(DSPECIES, dc, cPos + VECTOR_LENGTH))
+                 .intoArray(dc, cPos + VECTOR_LENGTH);
+             alphaVec.fma(c23, DoubleVector.fromArray(DSPECIES, dc, cPos + VECTOR_LENGTH2))
+                 .intoArray(dc, cPos + VECTOR_LENGTH2);
+             alphaVec.fma(c33, DoubleVector.fromArray(DSPECIES, dc, cPos + VECTOR_LENGTH3))
+                 .intoArray(dc, cPos + VECTOR_LENGTH3);
+
+             cIndx += DGEMM_UNROLL_M;
+         }
+
+         int rest = mc % DGEMM_UNROLL_M;
+
+         // At most one chunk that is two vectors tall.
+         if (rest >= VECTOR_LENGTH2) {
+             DoubleVector c00 = zero, c10 = zero;
+             DoubleVector c01 = zero, c11 = zero;
+             DoubleVector c02 = zero, c12 = zero;
+             DoubleVector c03 = zero, c13 = zero;
+             int bIndx = bStart;
+             for (int l = kc; l > 0; l--) {
+                 final DoubleVector a0 = DoubleVector.fromArray(DSPECIES, da, aIndx);
+                 final DoubleVector a1 = DoubleVector.fromArray(DSPECIES, da, aIndx + VECTOR_LENGTH);
+
+                 DoubleVector bv = DoubleVector.broadcast(DSPECIES, db[bIndx]);
+                 c00 = a0.fma(bv, c00);
+                 c10 = a1.fma(bv, c10);
+
+                 bv = DoubleVector.broadcast(DSPECIES, db[bIndx + 1]);
+                 c01 = a0.fma(bv, c01);
+                 c11 = a1.fma(bv, c11);
+
+                 bv = DoubleVector.broadcast(DSPECIES, db[bIndx + 2]);
+                 c02 = a0.fma(bv, c02);
+                 c12 = a1.fma(bv, c12);
+
+                 bv = DoubleVector.broadcast(DSPECIES, db[bIndx + 3]);
+                 c03 = a0.fma(bv, c03);
+                 c13 = a1.fma(bv, c13);
+
+                 aIndx += VECTOR_LENGTH2;
+                 bIndx += DGEMM_UNROLL_N;
+             }
+
+             int cPos = cIndx;
+             alphaVec.fma(c00, DoubleVector.fromArray(DSPECIES, dc, cPos)).intoArray(dc, cPos);
+             alphaVec.fma(c10, DoubleVector.fromArray(DSPECIES, dc, cPos + VECTOR_LENGTH))
+                 .intoArray(dc, cPos + VECTOR_LENGTH);
+             cPos += ldc;
+             alphaVec.fma(c01, DoubleVector.fromArray(DSPECIES, dc, cPos)).intoArray(dc, cPos);
+             alphaVec.fma(c11, DoubleVector.fromArray(DSPECIES, dc, cPos + VECTOR_LENGTH))
+                 .intoArray(dc, cPos + VECTOR_LENGTH);
+             cPos += ldc;
+             alphaVec.fma(c02, DoubleVector.fromArray(DSPECIES, dc, cPos)).intoArray(dc, cPos);
+             alphaVec.fma(c12, DoubleVector.fromArray(DSPECIES, dc, cPos + VECTOR_LENGTH))
+                 .intoArray(dc, cPos + VECTOR_LENGTH);
+             cPos += ldc;
+             alphaVec.fma(c03, DoubleVector.fromArray(DSPECIES, dc, cPos)).intoArray(dc, cPos);
+             alphaVec.fma(c13, DoubleVector.fromArray(DSPECIES, dc, cPos + VECTOR_LENGTH))
+                 .intoArray(dc, cPos + VECTOR_LENGTH);
+
+             cIndx += VECTOR_LENGTH2;
+             rest -= VECTOR_LENGTH2;
+         }
+
+         // At most one chunk that is a single vector tall.
+         if (rest >= VECTOR_LENGTH) {
+             DoubleVector c00 = zero, c01 = zero, c02 = zero, c03 = zero;
+             int bIndx = bStart;
+             for (int l = kc; l > 0; l--) {
+                 final DoubleVector a0 = DoubleVector.fromArray(DSPECIES, da, aIndx);
+
+                 c00 = a0.fma(DoubleVector.broadcast(DSPECIES, db[bIndx]), c00);
+                 c01 = a0.fma(DoubleVector.broadcast(DSPECIES, db[bIndx + 1]), c01);
+                 c02 = a0.fma(DoubleVector.broadcast(DSPECIES, db[bIndx + 2]), c02);
+                 c03 = a0.fma(DoubleVector.broadcast(DSPECIES, db[bIndx + 3]), c03);
+
+                 aIndx += VECTOR_LENGTH;
+                 bIndx += DGEMM_UNROLL_N;
+             }
+
+             int cPos = cIndx;
+             alphaVec.fma(c00, DoubleVector.fromArray(DSPECIES, dc, cPos)).intoArray(dc, cPos);
+             cPos += ldc;
+             alphaVec.fma(c01, DoubleVector.fromArray(DSPECIES, dc, cPos)).intoArray(dc, cPos);
+             cPos += ldc;
+             alphaVec.fma(c02, DoubleVector.fromArray(DSPECIES, dc, cPos)).intoArray(dc, cPos);
+             cPos += ldc;
+             alphaVec.fma(c03, DoubleVector.fromArray(DSPECIES, dc, cPos)).intoArray(dc, cPos);
+
+             cIndx += VECTOR_LENGTH;
+             rest -= VECTOR_LENGTH;
+         }
+
+         // Remaining single rows, one scalar sum per column of the panel.
+         for (; rest > 0; rest--) {
+             double s0 = 0.0d;
+             double s1 = 0.0d;
+             double s2 = 0.0d;
+             double s3 = 0.0d;
+             int bIndx = bStart;
+             for (int l = kc; l > 0; l--) {
+                 final double aVal = da[aIndx];
+                 s0 += aVal * db[bIndx];
+                 s1 += aVal * db[bIndx + 1];
+                 s2 += aVal * db[bIndx + 2];
+                 s3 += aVal * db[bIndx + 3];
+                 aIndx += 1;
+                 bIndx += DGEMM_UNROLL_N;
+             }
+             int cPos = cIndx;
+             dc[cPos] += alpha * s0;
+             cPos += ldc;
+             dc[cPos] += alpha * s1;
+             cPos += ldc;
+             dc[cPos] += alpha * s2;
+             cPos += ldc;
+             dc[cPos] += alpha * s3;
+
+             cIndx += 1;
+         }
+     }
+ }
+
+ /**
+  * Packs a sizeM x sizeN block of the column-major matrix src (right-hand operand, not
+  * transposed) into dst. Columns are taken in groups of DGEMM_UNROLL_N; inside a group the
+  * elements of one row are stored next to each other. Columns that do not fill a group are
+  * appended afterwards, one whole column at a time.
+  * For example, when DGEMM_UNROLL_N = 4 (numbers are storage positions):
+  *     before packing           after packing
+  *     1  6 11 16 21            1  2  3  4 21
+  *     2  7 12 17 22   --->     5  6  7  8 22
+  *     3  8 13 18 23            9 10 11 12 23
+  *     4  9 14 19 24           13 14 15 16 24
+  *     5 10 15 20 25           17 18 19 20 25
+  *
+  * @param sizeM number of rows to pack
+  * @param sizeN number of columns to pack
+  * @param src source matrix storage
+  * @param srcRow first row of the block inside src
+  * @param srcCol first column of the block inside src
+  * @param srcOffset index in src of the first element of the matrix
+  * @param srcLd leading dimension of src
+  * @param dst destination buffer
+  * @param dstOffset index in dst at which writing starts
+  */
+ protected static void onCopy(int sizeM, int sizeN, double[] src, int srcRow, int srcCol, int srcOffset, int srcLd,
+         double[] dst, int dstOffset) {
+     final int groupedCols = sizeN - sizeN % DGEMM_UNROLL_N;
+     final int firstRow = srcRow + srcOffset;
+     int out = dstOffset;
+     int col = 0;
+
+     // Complete groups: interleave the four columns row by row.
+     while (col < groupedCols) {
+         final int col0 = firstRow + (srcCol + col) * srcLd;
+         final int col1 = col0 + srcLd;
+         final int col2 = col1 + srcLd;
+         final int col3 = col2 + srcLd;
+         for (int row = 0; row < sizeM; row++) {
+             dst[out] = src[col0 + row];
+             dst[out + 1] = src[col1 + row];
+             dst[out + 2] = src[col2 + row];
+             dst[out + 3] = src[col3 + row];
+             out += DGEMM_UNROLL_N;
+         }
+         col += DGEMM_UNROLL_N;
+     }
+
+     // Leftover columns are copied as they are.
+     while (col < sizeN) {
+         final int colStart = firstRow + (srcCol + col) * srcLd;
+         for (int row = 0; row < sizeM; row++) {
+             dst[out] = src[colStart + row];
+             out++;
+         }
+         col++;
+     }
+ }
+
+ /**
+  * Packs a sizeM x sizeN block of the column-major matrix src (left-hand operand, not
+  * transposed) into dst. Rows are taken in chunks of VECTOR_LENGTH4, then VECTOR_LENGTH2, then
+  * VECTOR_LENGTH, then single rows; for each chunk the matching piece of every column is stored
+  * one after another.
+  * For example, when DGEMM_UNROLL_M = 4 (numbers are storage positions):
+  *     before packing           after packing
+  *     1  6 11 16 21            1  5  9 13 17
+  *     2  7 12 17 22   --->     2  6 10 14 18
+  *     3  8 13 18 23            3  7 11 15 19
+  *     4  9 14 19 24            4  8 12 16 20
+  *     5 10 15 20 25           21 22 23 24 25
+  *
+  * @param sizeM number of rows to pack
+  * @param sizeN number of columns to pack
+  * @param src source matrix storage
+  * @param srcRow first row of the block inside src
+  * @param srcCol first column of the block inside src
+  * @param srcOffset index in src of the first element of the matrix
+  * @param srcLd leading dimension of src
+  * @param dst destination buffer
+  * @param dstOffset index in dst at which writing starts
+  */
+ protected static void itCopy(int sizeM, int sizeN, double[] src, int srcRow, int srcCol, int srcOffset, int srcLd,
+         double[] dst, int dstOffset) {
+     final int[] chunkHeights = {VECTOR_LENGTH4, VECTOR_LENGTH2, VECTOR_LENGTH, 1};
+     int out = dstOffset;
+     int row = 0;
+     for (int i = 0; i < chunkHeights.length; i++) {
+         final int height = chunkHeights[i];
+         // Take as many chunks of this height as still fit into the remaining rows.
+         for (; row + height <= sizeM; row += height) {
+             int from = srcOffset + (srcRow + row) + srcCol * srcLd;
+             for (int col = 0; col < sizeN; col++) {
+                 System.arraycopy(src, from, dst, out, height);
+                 from += srcLd;
+                 out += height;
+             }
+         }
+     }
+ }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/doubleprecision/Dgemm.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/doubleprecision/Dgemm.java
new file mode 100644
index 0000000000000000000000000000000000000000..4cae10440b3b418283c69c7782853c22c7b78475
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/doubleprecision/Dgemm.java
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas3.doubleprecision;
+
+import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.DGEMM_P;
+import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.DGEMM_Q;
+import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.DGEMM_R;
+import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.DGEMM_UNROLL_N;
+import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.VECTOR_LENGTH;
+import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.VECTOR_LENGTH2;
+import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.VECTOR_LENGTH4;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+import com.huawei.vectorblas.utils.Lsame;
+
+public class Dgemm {
+ /**
+  * General matrix-matrix multiply on column-major arrays: scales the m x n matrix c by beta and
+  * then adds alpha * op(a) * op(b), where op(x) is x for "N" and the transpose of x for "T".
+  * Arguments are validated first; nothing is computed when m or n is zero, and only the scaling
+  * by beta is done when alpha is zero or k is zero.
+  *
+  * @param transa "N" to use a as stored, "T" to use its transpose
+  * @param transb "N" to use b as stored, "T" to use its transpose
+  * @param m number of rows of op(a) and of c
+  * @param n number of columns of op(b) and of c
+  * @param k number of columns of op(a) and rows of op(b)
+  * @param alpha scalar applied to the product
+  * @param a storage of the left-hand matrix
+  * @param aOffset index in a of the first element of the matrix
+  * @param lda leading dimension of a
+  * @param b storage of the right-hand matrix
+  * @param bOffset index in b of the first element of the matrix
+  * @param ldb leading dimension of b
+  * @param beta scalar applied to c before the product is added
+  * @param c storage of the result matrix, updated in place
+  * @param cOffset index in c of the first element of the matrix
+  * @param ldc leading dimension of c
+  */
+ public static void dgemm(String transa, String transb, int m, int n, int k, double alpha, double[] a, int aOffset,
+         int lda, double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc) {
+     final boolean aAsStored = Lsame.lsame(transa, "N");
+     BlasUtils.checkParameter("DGEMM", 1, aAsStored || Lsame.lsame(transa, "T"));
+     final boolean bAsStored = Lsame.lsame(transb, "N");
+     BlasUtils.checkParameter("DGEMM", 2, bAsStored || Lsame.lsame(transb, "T"));
+     BlasUtils.checkParameter("DGEMM", 3, m >= 0);
+     BlasUtils.checkParameter("DGEMM", 4, n >= 0);
+     BlasUtils.checkParameter("DGEMM", 5, k >= 0);
+
+     // Stored shape of a and b, which depends on whether they are used transposed.
+     final int aRows = aAsStored ? m : k;
+     final int aCols = aAsStored ? k : m;
+     final int bRows = bAsStored ? k : n;
+     final int bCols = bAsStored ? n : k;
+     BlasUtils.checkParameter("DGEMM", 8, lda >= Math.max(1, aRows));
+     BlasUtils.checkParameter("DGEMM", 10, ldb >= Math.max(1, bRows));
+     BlasUtils.checkParameter("DGEMM", 13, ldc >= Math.max(1, m));
+
+     if (m == 0 || n == 0) {
+         return;
+     }
+
+     final int cLastIndex = (m - 1) + (n - 1) * ldc;
+     final boolean betaIsOne = Double.compare(beta, 1.0d) == 0;
+     if (!betaIsOne) {
+         BlasUtils.checkBlasArray("c", cOffset, cLastIndex, c.length);
+         DblasLevel3.betaMulC(m, n, beta, c, cOffset, ldc);
+     }
+
+     // Without a product term the scaled c is already the result.
+     if (BlasUtils.isZero(alpha) || k == 0) {
+         return;
+     }
+
+     BlasUtils.checkBlasArray("a", aOffset, (aRows - 1) + (aCols - 1) * lda, a.length);
+     BlasUtils.checkBlasArray("b", bOffset, (bRows - 1) + (bCols - 1) * ldb, b.length);
+     BlasUtils.checkBlasArray("c", cOffset, cLastIndex, c.length);
+     dgemmVector(transa, transb, m, n, k, a, aOffset, lda, alpha, b, bOffset, ldb, c, cOffset, ldc);
+ }
+
+ /**
+  * Blocked product: adds alpha * op(da) * op(db) to dc. The n, k and m directions are cut into
+  * blocks of at most DGEMM_R, DGEMM_Q and DGEMM_P; every block of db and da is packed into a
+  * scratch buffer and handed to the vector kernel.
+  * The size of each block is computed per iteration from the fixed maximum. Reusing one variable
+  * for both would keep the size of a short last block for all following passes (for example
+  * one-row blocks after a single leftover row), which stays correct but is much slower.
+  *
+  * @param transa "N" when da is used as stored, otherwise it is used transposed
+  * @param transb "N" when db is used as stored, otherwise it is used transposed
+  * @param sizeM number of rows of the result
+  * @param sizeN number of columns of the result
+  * @param sizeK length of the inner dimension
+  * @param da storage of the left-hand matrix
+  * @param aOffset index in da of the first element of the matrix
+  * @param lda leading dimension of da
+  * @param alpha scalar applied to the product
+  * @param db storage of the right-hand matrix
+  * @param bOffset index in db of the first element of the matrix
+  * @param ldb leading dimension of db
+  * @param dc storage of the result matrix, updated in place
+  * @param cOffset index in dc of the first element of the matrix
+  * @param ldc leading dimension of dc
+  */
+ private static void dgemmVector(String transa, String transb, int sizeM, int sizeN, int sizeK, double[] da,
+         int aOffset, int lda, double alpha, double[] db, int bOffset, int ldb, double[] dc, int cOffset, int ldc) {
+     final int maxMc = Math.min(DGEMM_P, sizeM);
+     final int maxNc = Math.min(DGEMM_R, sizeN);
+     final int maxKc = Math.min(DGEMM_Q, sizeK);
+     final boolean aAsStored = Lsame.lsame(transa, "N");
+     final boolean bAsStored = Lsame.lsame(transb, "N");
+     // Scratch buffers sized for the largest block; shorter blocks use a prefix.
+     double[] packa = new double[maxKc * maxMc];
+     double[] packb = new double[maxKc * maxNc];
+     for (int ns = 0; ns < sizeN; ns += maxNc) {
+         final int nc = Math.min(maxNc, sizeN - ns);
+         for (int ks = 0; ks < sizeK; ks += maxKc) {
+             final int kc = Math.min(maxKc, sizeK - ks);
+             if (bAsStored) {
+                 DblasLevel3.onCopy(kc, nc, db, ks, ns, bOffset, ldb, packb, 0); // packing matrix b
+             } else {
+                 otCopy(nc, kc, db, ns, ks, bOffset, ldb, packb, 0);
+             }
+             for (int ms = 0; ms < sizeM; ms += maxMc) {
+                 final int mc = Math.min(maxMc, sizeM - ms);
+                 if (aAsStored) {
+                     DblasLevel3.itCopy(mc, kc, da, ms, ks, aOffset, lda, packa, 0); // packing matrix a
+                 } else {
+                     inCopy(kc, mc, da, ks, ms, aOffset, lda, packa, 0);
+                 }
+                 DblasLevel3.kernelOperation8x4(mc, nc, kc, alpha, packa, packb, 0, dc, ldc, cOffset, ms, ns);
+             }
+         }
+     }
+ }
+
+ /**
+  * Packs a sizeM x sizeN block of the column-major matrix src (right-hand operand, used
+  * transposed) into dst. Rows are taken in groups of DGEMM_UNROLL_N; for each group the matching
+  * piece of every column is stored one after another. Rows that do not fill a group are appended
+  * afterwards, one whole row at a time.
+  * For example, when DGEMM_UNROLL_N = 4 (numbers are storage positions):
+  *     before packing        after packing
+  *     1  6 11 16            1  5  9 13
+  *     2  7 12 17   --->     2  6 10 14
+  *     3  8 13 18            3  7 11 15
+  *     4  9 14 19            4  8 12 16
+  *     5 10 15 20           17 18 19 20
+  *
+  * @param sizeM number of rows to pack
+  * @param sizeN number of columns to pack
+  * @param src source matrix storage
+  * @param srcRow first row of the block inside src
+  * @param srcCol first column of the block inside src
+  * @param srcOffset index in src of the first element of the matrix
+  * @param srcLd leading dimension of src
+  * @param dst destination buffer
+  * @param dstOffset index in dst at which writing starts
+  */
+ private static void otCopy(int sizeM, int sizeN, double[] src, int srcRow, int srcCol, int srcOffset, int srcLd,
+         double[] dst, int dstOffset) {
+     final int groupedRows = sizeM - sizeM % DGEMM_UNROLL_N;
+     int out = dstOffset;
+     int row = 0;
+
+     // Complete groups: copy DGEMM_UNROLL_N consecutive rows of each column at once.
+     while (row < groupedRows) {
+         int from = (srcRow + row) + srcCol * srcLd + srcOffset;
+         for (int col = 0; col < sizeN; col++) {
+             System.arraycopy(src, from, dst, out, DGEMM_UNROLL_N);
+             from += srcLd;
+             out += DGEMM_UNROLL_N;
+         }
+         row += DGEMM_UNROLL_N;
+     }
+
+     // Leftover rows are copied element by element along the row.
+     while (row < sizeM) {
+         int from = (srcRow + row) + srcCol * srcLd + srcOffset;
+         for (int col = 0; col < sizeN; col++) {
+             dst[out] = src[from];
+             from += srcLd;
+             out++;
+         }
+         row++;
+     }
+ }
+
+ /**
+  * Packs a sizeM x sizeN block of the column-major matrix src (left-hand operand, used
+  * transposed) into dst. Columns are taken in chunks of VECTOR_LENGTH4, then VECTOR_LENGTH2, then
+  * VECTOR_LENGTH, then single columns; inside a chunk the elements of one row are stored next to
+  * each other.
+  * For example, when DGEMM_UNROLL_M = 4 (numbers are storage positions):
+  *     before packing           after packing
+  *     1  6 11 16 21            1  2  3  4 21
+  *     2  7 12 17 22   --->     5  6  7  8 22
+  *     3  8 13 18 23            9 10 11 12 23
+  *     4  9 14 19 24           13 14 15 16 24
+  *     5 10 15 20 25           17 18 19 20 25
+  *
+  * @param sizeM number of rows to pack
+  * @param sizeN number of columns to pack
+  * @param src source matrix storage
+  * @param srcRow first row of the block inside src
+  * @param srcCol first column of the block inside src
+  * @param srcOffset index in src of the first element of the matrix
+  * @param srcLd leading dimension of src
+  * @param dst destination buffer
+  * @param dstOffset index in dst at which writing starts
+  */
+ private static void inCopy(int sizeM, int sizeN, double[] src, int srcRow, int srcCol, int srcOffset, int srcLd,
+         double[] dst, int dstOffset) {
+     final int[] chunkWidths = {VECTOR_LENGTH4, VECTOR_LENGTH2, VECTOR_LENGTH, 1};
+     int out = dstOffset;
+     int col = 0;
+     for (int i = 0; i < chunkWidths.length; i++) {
+         final int width = chunkWidths[i];
+         // Take as many chunks of this width as still fit into the remaining columns.
+         for (; col + width <= sizeN; col += width) {
+             final int chunkStart = srcOffset + srcRow + (srcCol + col) * srcLd;
+             for (int row = 0; row < sizeM; row++) {
+                 int from = chunkStart + row;
+                 for (int lane = 0; lane < width; lane++) {
+                     dst[out] = src[from];
+                     from += srcLd;
+                     out++;
+                 }
+             }
+         }
+     }
+ }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/doubleprecision/Dsymm.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/doubleprecision/Dsymm.java
new file mode 100644
index 0000000000000000000000000000000000000000..2a1f9d8f051b1d2349de4d4ac619b5d4d0548133
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/doubleprecision/Dsymm.java
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas3.doubleprecision;
+
+import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.DGEMM_P;
+import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.DGEMM_Q;
+import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.DGEMM_R;
+import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.DGEMM_UNROLL_N;
+import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.VECTOR_LENGTH;
+import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.VECTOR_LENGTH2;
+import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.VECTOR_LENGTH4;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+import com.huawei.vectorblas.utils.Lsame;
+
+public class Dsymm {
+    /** Column-group widths, widest first, used when packing the symmetric matrix as the right-hand operand. */
+    private static final int[] RIGHT_PANEL_WIDTHS = {DGEMM_UNROLL_N, 1};
+
+    /** Column-group widths, widest first, used when packing the symmetric matrix as the left-hand operand. */
+    private static final int[] LEFT_PANEL_WIDTHS = {VECTOR_LENGTH4, VECTOR_LENGTH2, VECTOR_LENGTH, 1};
+
+    /**
+     * BLAS level-3 DSYMM on column-major arrays: C := alpha * A * B + beta * C when side is "L", or
+     * C := alpha * B * A + beta * C when side is "R". A is symmetric and only the triangle named by uplo is read.
+     *
+     * @param side "L" or "R" (compared with Lsame.lsame): the side A multiplies B from
+     * @param uplo "U" or "L" (compared with Lsame.lsame): the triangle of a that holds the data
+     * @param m number of rows of B and C
+     * @param n number of columns of B and C
+     * @param alpha scalar applied to the product; when BlasUtils.isZero(alpha) only the beta scaling is done
+     * @param a storage of A (order m for side "L", order n for side "R")
+     * @param aOffset index of A's first element in a
+     * @param lda leading dimension of A
+     * @param b storage of the m x n matrix B
+     * @param bOffset index of B's first element in b
+     * @param ldb leading dimension of B
+     * @param beta scalar applied to C before the product is added; the scaling pass is skipped when it is 1.0
+     * @param c storage of the m x n matrix C, updated in place
+     * @param cOffset index of C's first element in c
+     * @param ldc leading dimension of C
+     */
+    public static void dsymm(String side, String uplo, int m, int n, double alpha, double[] a, int aOffset, int lda,
+            double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc) {
+        BlasUtils.checkParameter("DSYMM", 1, Lsame.lsame(side, "L") || Lsame.lsame(side, "R"));
+        BlasUtils.checkParameter("DSYMM", 2, Lsame.lsame(uplo, "U") || Lsame.lsame(uplo, "L"));
+        boolean sideFlag = Lsame.lsame(side, "L");
+        // A is square; its order follows the side it multiplies from.
+        int orderA = sideFlag ? m : n;
+        BlasUtils.checkParameter("DSYMM", 3, m >= 0);
+        BlasUtils.checkParameter("DSYMM", 4, n >= 0);
+        BlasUtils.checkParameter("DSYMM", 7, lda >= Math.max(1, orderA));
+        BlasUtils.checkParameter("DSYMM", 9, ldb >= Math.max(1, m));
+        BlasUtils.checkParameter("DSYMM", 12, ldc >= Math.max(1, m));
+
+        if (m == 0 || n == 0) {
+            return;
+        }
+        if (Double.compare(beta, 1.0d) != 0) {
+            BlasUtils.checkBlasArray("c", cOffset, (m - 1) + (n - 1) * ldc, c.length);
+            DblasLevel3.betaMulC(m, n, beta, c, cOffset, ldc);
+        }
+        if (BlasUtils.isZero(alpha)) {
+            return;
+        }
+        BlasUtils.checkBlasArray("a", aOffset, (orderA - 1) + (orderA - 1) * lda, a.length);
+        BlasUtils.checkBlasArray("b", bOffset, (m - 1) + (n - 1) * ldb, b.length);
+        BlasUtils.checkBlasArray("c", cOffset, (m - 1) + (n - 1) * ldc, c.length);
+        dsymmVector(side, uplo, m, n, orderA, a, aOffset, lda, alpha, b, bOffset, ldb, c, cOffset, ldc);
+    }
+
+    /**
+     * Blocked driver: walks C in panels of at most DGEMM_R columns, DGEMM_Q inner indices and DGEMM_P rows,
+     * packs the matching pieces of A and B and hands each tile to DblasLevel3.kernelOperation8x4.
+     * The size of every block is derived from the configured maximum on each iteration, so a short trailing
+     * block does not shrink the blocks used by later panels.
+     */
+    private static void dsymmVector(String side, String uplo, int sizeM, int sizeN, int sizeK, double[] da, int aOffset,
+            int lda, double alpha, double[] db, int bOffset, int ldb, double[] dc, int cOffset, int ldc) {
+        boolean leftSide = Lsame.lsame(side, "L");
+        boolean upper = Lsame.lsame(uplo, "U");
+        int mcMax = Math.min(DGEMM_P, sizeM);
+        int ncMax = Math.min(DGEMM_R, sizeN);
+        int kcMax = Math.min(DGEMM_Q, sizeK);
+        double[] packa = new double[kcMax * (leftSide ? mcMax : ncMax)];
+        double[] packb = new double[kcMax * (leftSide ? ncMax : mcMax)];
+        for (int ns = 0; ns < sizeN; ns += ncMax) {
+            int nc = Math.min(ncMax, sizeN - ns);
+            for (int ks = 0; ks < sizeK; ks += kcMax) {
+                int kc = Math.min(kcMax, sizeK - ks);
+                // The right-hand operand of the kernel is packed once per (ns, ks) panel.
+                if (leftSide) {
+                    DblasLevel3.onCopy(kc, nc, db, ks, ns, bOffset, ldb, packb, 0);
+                } else if (upper) {
+                    outCopy(kc, nc, da, aOffset, lda, packa, 0, ns, ks);
+                } else {
+                    oltCopy(kc, nc, da, aOffset, lda, packa, 0, ns, ks);
+                }
+                for (int ms = 0; ms < sizeM; ms += mcMax) {
+                    int mc = Math.min(mcMax, sizeM - ms);
+                    if (leftSide) {
+                        if (upper) {
+                            iutCopy(kc, mc, da, aOffset, lda, packa, 0, ms, ks);
+                        } else {
+                            iltCopy(kc, mc, da, aOffset, lda, packa, 0, ms, ks);
+                        }
+                        DblasLevel3.kernelOperation8x4(mc, nc, kc, alpha, packa, packb, 0, dc, ldc, cOffset, ms, ns);
+                    } else {
+                        DblasLevel3.itCopy(mc, kc, db, ms, ks, bOffset, ldb, packb, 0);
+                        DblasLevel3.kernelOperation8x4(mc, nc, kc, alpha, packb, packa, 0, dc, ldc, cOffset, ms, ns);
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * oltCopy packs a block of the symmetric matrix, read from its lower triangle, as the right-hand operand.
+     */
+    private static void oltCopy(int sizeM, int sizeN, double[] src, int srcOffset, int srcLd, double[] dst,
+            int dstOffset, int posX, int posY) {
+        packSymmetric(true, RIGHT_PANEL_WIDTHS, sizeM, sizeN, src, srcOffset, srcLd, dst, dstOffset, posX, posY);
+    }
+
+    /**
+     * outCopy packs a block of the symmetric matrix, read from its upper triangle, as the right-hand operand.
+     */
+    private static void outCopy(int sizeM, int sizeN, double[] src, int srcOffset, int srcLd, double[] dst,
+            int dstOffset, int posX, int posY) {
+        packSymmetric(false, RIGHT_PANEL_WIDTHS, sizeM, sizeN, src, srcOffset, srcLd, dst, dstOffset, posX, posY);
+    }
+
+    /**
+     * iltCopy packs a block of the symmetric matrix, read from its lower triangle, as the left-hand operand.
+     */
+    private static void iltCopy(int sizeM, int sizeN, double[] src, int srcOffset, int srcLd, double[] dst,
+            int dstOffset, int posX, int posY) {
+        packSymmetric(true, LEFT_PANEL_WIDTHS, sizeM, sizeN, src, srcOffset, srcLd, dst, dstOffset, posX, posY);
+    }
+
+    /**
+     * iutCopy packs a block of the symmetric matrix, read from its upper triangle, as the left-hand operand.
+     */
+    private static void iutCopy(int sizeM, int sizeN, double[] src, int srcOffset, int srcLd, double[] dst,
+            int dstOffset, int posX, int posY) {
+        packSymmetric(false, LEFT_PANEL_WIDTHS, sizeM, sizeN, src, srcOffset, srcLd, dst, dstOffset, posX, posY);
+    }
+
+    /**
+     * Packs the sizeM x sizeN block of the full symmetric matrix whose first row is posY and first column is
+     * posX. Columns are taken in groups whose widths come from panelWidths; within a group the values of one
+     * row are written side by side into dst, starting at dstOffset. Element (r, c) is read at index
+     * r + c * srcLd when it lies in the stored triangle and at the mirrored index c + r * srcLd otherwise, so
+     * the other triangle of src is never touched.
+     *
+     * @param lowerStored true when src holds the lower triangle, false when it holds the upper one
+     * @param panelWidths group widths, widest first; both callers end the list with 1 so no column is left over
+     */
+    private static void packSymmetric(boolean lowerStored, int[] panelWidths, int sizeM, int sizeN, double[] src,
+            int srcOffset, int srcLd, double[] dst, int dstOffset, int posX, int posY) {
+        int dstIndex = dstOffset;
+        int countJ = sizeN;
+        for (int width : panelWidths) {
+            int[] offset = new int[width];
+            while (countJ - width >= 0) {
+                int delta = posX - posY;
+                for (int index = 0; index < width; index++) {
+                    int direct = posY + (posX + index) * srcLd;
+                    int mirrored = posX + index + posY * srcLd;
+                    // delta > -index means the column index still exceeds the row index (upper triangle).
+                    offset[index] = ((delta > -index) == lowerStored) ? mirrored : direct;
+                }
+                for (int countI = sizeM; countI > 0; countI--) {
+                    for (int index = 0; index < width; index++) {
+                        dst[dstIndex++] = src[srcOffset + offset[index]];
+                        // Moving one row down is +1 on the direct index and +srcLd on the mirrored one.
+                        offset[index] += ((delta > -index) == lowerStored) ? srcLd : 1;
+                    }
+                    delta--;
+                }
+                posX += width;
+                countJ -= width;
+            }
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/singleprecision/SblasLevel3.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/singleprecision/SblasLevel3.java
new file mode 100644
index 0000000000000000000000000000000000000000..b1ad539f00ca1859edd8115bf4aaa69b6c024894
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/singleprecision/SblasLevel3.java
@@ -0,0 +1,436 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas3.singleprecision;
+
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.VectorSpecies;
+
+public class SblasLevel3 {
+    private static final VectorSpecies<Float> SSPECIES = FloatVector.SPECIES_MAX;
+    protected static final int SGEMM_P = 256; // Blocking size for m direction.
+    protected static final int SGEMM_Q = 256; // Blocking size for k direction.
+    protected static final int SGEMM_R = 8192; // Blocking size for n direction.
+    protected static final int VECTOR_LENGTH = SSPECIES.length();
+    protected static final int VECTOR_LENGTH2 = 2 * VECTOR_LENGTH;
+    protected static final int VECTOR_LENGTH3 = 3 * VECTOR_LENGTH;
+    protected static final int VECTOR_LENGTH4 = 4 * VECTOR_LENGTH;
+    protected static final int SGEMM_UNROLL_M = 4 * VECTOR_LENGTH;
+    protected static final int SGEMM_UNROLL_N = 4;
+
+    /**
+     * Multiplies the sizeM x sizeN column-major block of sc (first element at cOffset, leading dimension ldc)
+     * by beta in place. A beta of zero stores zeros instead of multiplying, so NaN or infinite values already
+     * present in the block do not survive, as the BLAS convention for beta = 0 requires.
+     */
+    protected static void betaMulC(int sizeM, int sizeN, float beta, float[] sc, int cOffset, int ldc) {
+        if (beta == 0.0f) {
+            for (int col = 0; col < sizeN; col++) {
+                int base = col * ldc + cOffset;
+                for (int row = 0; row < sizeM; row++) {
+                    sc[base + row] = 0.0f;
+                }
+            }
+            return;
+        }
+        FloatVector betav = FloatVector.broadcast(SSPECIES, beta);
+        for (int col = 0; col < sizeN; col++) {
+            int base = col * ldc + cOffset;
+            int row = 0;
+            // "<=" lets a final, exactly full vector take the vector path instead of the scalar tail.
+            for (; row <= sizeM - VECTOR_LENGTH; row += VECTOR_LENGTH) {
+                FloatVector.fromArray(SSPECIES, sc, base + row).mul(betav).intoArray(sc, base + row);
+            }
+            for (; row < sizeM; row++) {
+                sc[base + row] *= beta;
+            }
+        }
+    }
+
+    /**
+     * Adds alpha * A * B to the mc x nc tile of sc whose top-left element is (csRow, csCol), where A is the
+     * mc x kc operand packed in sa (layout written by itCopy) and B the kc x nc operand packed in sb starting
+     * at bOffset (layout written by onCopy). Columns are handled in groups of SGEMM_UNROLL_N first, then
+     * one at a time.
+     */
+    protected static void kernelOperation16x4(int mc, int nc, int kc, float alpha, float[] sa, float[] sb, int bOffset,
+            float[] sc, int ldc, int cOffset, int csRow, int csCol) {
+        kernelOperation16x4Main(mc, nc, kc, alpha, sa, sb, bOffset, sc, ldc, cOffset, csRow, csCol);
+        kernelOperation16x4NBorder(mc, nc, kc, alpha, sa, sb, bOffset, sc, ldc, cOffset, csRow, csCol);
+    }
+
+    /** Adds alphaVec * acc, lane by lane, to the vector-length slice of sc that starts at index. */
+    private static void addScaled(FloatVector alphaVec, FloatVector acc, float[] sc, int index) {
+        alphaVec.fma(acc, FloatVector.fromArray(SSPECIES, sc, index)).intoArray(sc, index);
+    }
+
+    /**
+     * Handles the nc % SGEMM_UNROLL_N trailing columns one at a time. Rows are processed in strips of
+     * SGEMM_UNROLL_M, then at most one strip of VECTOR_LENGTH2, at most one of VECTOR_LENGTH, and finally
+     * single rows.
+     */
+    private static void kernelOperation16x4NBorder(int mc, int nc, int kc, float alpha, float[] sa, float[] sb,
+            int bOffset, float[] sc, int ldc, int cOffset, int csRow, int csCol) {
+        FloatVector alphaVec = FloatVector.broadcast(SSPECIES, alpha);
+        int cCol = csCol + (nc / SGEMM_UNROLL_N) * SGEMM_UNROLL_N;
+        int countJ = nc % SGEMM_UNROLL_N;
+        for (; countJ > 0; countJ--) {
+            // First element of this column inside sb; every row strip below restarts from here.
+            int bStart = bOffset + (nc - countJ) * kc;
+            int cRow = csRow;
+            int aIndx = 0;
+            int countI = mc / SGEMM_UNROLL_M;
+            for (; countI > 0; countI--) {
+                int bIndx = bStart;
+                FloatVector c00 = FloatVector.zero(SSPECIES);
+                FloatVector c10 = FloatVector.zero(SSPECIES);
+                FloatVector c20 = FloatVector.zero(SSPECIES);
+                FloatVector c30 = FloatVector.zero(SSPECIES);
+                for (int countL = kc; countL > 0; countL--) {
+                    FloatVector a0 = FloatVector.fromArray(SSPECIES, sa, aIndx);
+                    FloatVector a1 = FloatVector.fromArray(SSPECIES, sa, aIndx + VECTOR_LENGTH);
+                    FloatVector a2 = FloatVector.fromArray(SSPECIES, sa, aIndx + VECTOR_LENGTH2);
+                    FloatVector a3 = FloatVector.fromArray(SSPECIES, sa, aIndx + VECTOR_LENGTH3);
+                    FloatVector b0 = FloatVector.broadcast(SSPECIES, sb[bIndx]);
+
+                    c00 = a0.fma(b0, c00);
+                    c10 = a1.fma(b0, c10);
+                    c20 = a2.fma(b0, c20);
+                    c30 = a3.fma(b0, c30);
+
+                    aIndx += SGEMM_UNROLL_M;
+                    bIndx += 1;
+                }
+                int cIndx = cOffset + cRow + cCol * ldc;
+                addScaled(alphaVec, c00, sc, cIndx);
+                addScaled(alphaVec, c10, sc, cIndx + VECTOR_LENGTH);
+                addScaled(alphaVec, c20, sc, cIndx + VECTOR_LENGTH2);
+                addScaled(alphaVec, c30, sc, cIndx + VECTOR_LENGTH3);
+
+                cRow += SGEMM_UNROLL_M;
+            }
+            countI = mc % SGEMM_UNROLL_M;
+            if (countI >= VECTOR_LENGTH2) {
+                int bIndx = bStart;
+                FloatVector c00 = FloatVector.zero(SSPECIES);
+                FloatVector c10 = FloatVector.zero(SSPECIES);
+                for (int countL = kc; countL > 0; countL--) {
+                    FloatVector a0 = FloatVector.fromArray(SSPECIES, sa, aIndx);
+                    FloatVector a1 = FloatVector.fromArray(SSPECIES, sa, aIndx + VECTOR_LENGTH);
+                    FloatVector b0 = FloatVector.broadcast(SSPECIES, sb[bIndx]);
+
+                    c00 = a0.fma(b0, c00);
+                    c10 = a1.fma(b0, c10);
+
+                    aIndx += VECTOR_LENGTH2;
+                    bIndx += 1;
+                }
+                int cIndx = cOffset + cRow + cCol * ldc;
+                addScaled(alphaVec, c00, sc, cIndx);
+                addScaled(alphaVec, c10, sc, cIndx + VECTOR_LENGTH);
+
+                cRow += VECTOR_LENGTH2;
+                countI -= VECTOR_LENGTH2;
+            }
+            if (countI >= VECTOR_LENGTH) {
+                int bIndx = bStart;
+                FloatVector c00 = FloatVector.zero(SSPECIES);
+                for (int countL = kc; countL > 0; countL--) {
+                    FloatVector a0 = FloatVector.fromArray(SSPECIES, sa, aIndx);
+                    FloatVector b0 = FloatVector.broadcast(SSPECIES, sb[bIndx]);
+                    c00 = a0.fma(b0, c00);
+                    aIndx += VECTOR_LENGTH;
+                    bIndx += 1;
+                }
+                addScaled(alphaVec, c00, sc, cOffset + cRow + cCol * ldc);
+
+                cRow += VECTOR_LENGTH;
+                countI -= VECTOR_LENGTH;
+            }
+            while (countI > 0) {
+                int bIndx = bStart;
+                float sum = 0.0f;
+                for (int countL = kc; countL > 0; countL--) {
+                    // bIndx already includes bOffset, matching every vector path above.
+                    sum += sa[aIndx] * sb[bIndx];
+                    aIndx += 1;
+                    bIndx += 1;
+                }
+                sc[cOffset + cRow + cCol * ldc] += alpha * sum;
+
+                cRow += 1;
+                countI -= 1;
+            }
+            cCol += 1;
+        }
+    }
+
+    /**
+     * Handles the full groups of SGEMM_UNROLL_N columns. For every group the rows are processed in strips of
+     * SGEMM_UNROLL_M, then at most one strip of VECTOR_LENGTH2, at most one of VECTOR_LENGTH, and finally
+     * single rows; each strip accumulates over kc before alpha times the sums is added to sc.
+     */
+    private static void kernelOperation16x4Main(int mc, int nc, int kc, float alpha, float[] sa, float[] sb,
+            int bOffset, float[] sc, int ldc, int cOffset, int csRow, int csCol) {
+        FloatVector alphaVec = FloatVector.broadcast(SSPECIES, alpha);
+        int countJ = nc / SGEMM_UNROLL_N;
+        int cCol = csCol;
+        for (; countJ > 0; countJ--) {
+            // First element of this column group inside sb; every row strip below restarts from here.
+            int bStart = bOffset + (nc / SGEMM_UNROLL_N - countJ) * SGEMM_UNROLL_N * kc;
+            int cRow = csRow;
+            int aIndx = 0;
+            int countI = mc / SGEMM_UNROLL_M;
+            for (; countI > 0; countI--) {
+                FloatVector c00 = FloatVector.zero(SSPECIES);
+                FloatVector c10 = FloatVector.zero(SSPECIES);
+                FloatVector c20 = FloatVector.zero(SSPECIES);
+                FloatVector c30 = FloatVector.zero(SSPECIES);
+                FloatVector c01 = FloatVector.zero(SSPECIES);
+                FloatVector c11 = FloatVector.zero(SSPECIES);
+                FloatVector c21 = FloatVector.zero(SSPECIES);
+                FloatVector c31 = FloatVector.zero(SSPECIES);
+                FloatVector c02 = FloatVector.zero(SSPECIES);
+                FloatVector c12 = FloatVector.zero(SSPECIES);
+                FloatVector c22 = FloatVector.zero(SSPECIES);
+                FloatVector c32 = FloatVector.zero(SSPECIES);
+                FloatVector c03 = FloatVector.zero(SSPECIES);
+                FloatVector c13 = FloatVector.zero(SSPECIES);
+                FloatVector c23 = FloatVector.zero(SSPECIES);
+                FloatVector c33 = FloatVector.zero(SSPECIES);
+                int bIndx = bStart;
+                for (int countL = kc; countL > 0; countL--) {
+                    FloatVector b0 = FloatVector.broadcast(SSPECIES, sb[bIndx]);
+
+                    FloatVector a0 = FloatVector.fromArray(SSPECIES, sa, aIndx);
+                    FloatVector a1 = FloatVector.fromArray(SSPECIES, sa, aIndx + VECTOR_LENGTH);
+                    FloatVector a2 = FloatVector.fromArray(SSPECIES, sa, aIndx + VECTOR_LENGTH2);
+                    FloatVector a3 = FloatVector.fromArray(SSPECIES, sa, aIndx + VECTOR_LENGTH3);
+
+                    c00 = a0.fma(b0, c00);
+                    c10 = a1.fma(b0, c10);
+                    c20 = a2.fma(b0, c20);
+                    c30 = a3.fma(b0, c30);
+
+                    FloatVector b1 = FloatVector.broadcast(SSPECIES, sb[bIndx + 1]);
+                    c01 = a0.fma(b1, c01);
+                    c11 = a1.fma(b1, c11);
+                    c21 = a2.fma(b1, c21);
+                    c31 = a3.fma(b1, c31);
+
+                    FloatVector b2 = FloatVector.broadcast(SSPECIES, sb[bIndx + 2]);
+                    c02 = a0.fma(b2, c02);
+                    c12 = a1.fma(b2, c12);
+                    c22 = a2.fma(b2, c22);
+                    c32 = a3.fma(b2, c32);
+
+                    FloatVector b3 = FloatVector.broadcast(SSPECIES, sb[bIndx + 3]);
+                    c03 = a0.fma(b3, c03);
+                    c13 = a1.fma(b3, c13);
+                    c23 = a2.fma(b3, c23);
+                    c33 = a3.fma(b3, c33);
+                    aIndx += SGEMM_UNROLL_M;
+                    bIndx += SGEMM_UNROLL_N;
+                }
+                int col0 = cOffset + cRow + cCol * ldc;
+                int col1 = col0 + ldc;
+                int col2 = col1 + ldc;
+                int col3 = col2 + ldc;
+                addScaled(alphaVec, c00, sc, col0);
+                addScaled(alphaVec, c10, sc, col0 + VECTOR_LENGTH);
+                addScaled(alphaVec, c20, sc, col0 + VECTOR_LENGTH2);
+                addScaled(alphaVec, c30, sc, col0 + VECTOR_LENGTH3);
+
+                addScaled(alphaVec, c01, sc, col1);
+                addScaled(alphaVec, c11, sc, col1 + VECTOR_LENGTH);
+                addScaled(alphaVec, c21, sc, col1 + VECTOR_LENGTH2);
+                addScaled(alphaVec, c31, sc, col1 + VECTOR_LENGTH3);
+
+                addScaled(alphaVec, c02, sc, col2);
+                addScaled(alphaVec, c12, sc, col2 + VECTOR_LENGTH);
+                addScaled(alphaVec, c22, sc, col2 + VECTOR_LENGTH2);
+                addScaled(alphaVec, c32, sc, col2 + VECTOR_LENGTH3);
+
+                addScaled(alphaVec, c03, sc, col3);
+                addScaled(alphaVec, c13, sc, col3 + VECTOR_LENGTH);
+                addScaled(alphaVec, c23, sc, col3 + VECTOR_LENGTH2);
+                addScaled(alphaVec, c33, sc, col3 + VECTOR_LENGTH3);
+
+                cRow += SGEMM_UNROLL_M;
+            }
+            countI = mc % SGEMM_UNROLL_M;
+            if (countI >= VECTOR_LENGTH2) {
+                int bIndx = bStart;
+                FloatVector c00 = FloatVector.zero(SSPECIES);
+                FloatVector c10 = FloatVector.zero(SSPECIES);
+                FloatVector c01 = FloatVector.zero(SSPECIES);
+                FloatVector c11 = FloatVector.zero(SSPECIES);
+                FloatVector c02 = FloatVector.zero(SSPECIES);
+                FloatVector c12 = FloatVector.zero(SSPECIES);
+                FloatVector c03 = FloatVector.zero(SSPECIES);
+                FloatVector c13 = FloatVector.zero(SSPECIES);
+                for (int countL = kc; countL > 0; countL--) {
+                    FloatVector a0 = FloatVector.fromArray(SSPECIES, sa, aIndx);
+                    FloatVector a1 = FloatVector.fromArray(SSPECIES, sa, aIndx + VECTOR_LENGTH);
+
+                    FloatVector b0 = FloatVector.broadcast(SSPECIES, sb[bIndx]);
+                    FloatVector b1 = FloatVector.broadcast(SSPECIES, sb[bIndx + 1]);
+                    FloatVector b2 = FloatVector.broadcast(SSPECIES, sb[bIndx + 2]);
+                    FloatVector b3 = FloatVector.broadcast(SSPECIES, sb[bIndx + 3]);
+
+                    c00 = a0.fma(b0, c00);
+                    c10 = a1.fma(b0, c10);
+                    c01 = a0.fma(b1, c01);
+                    c11 = a1.fma(b1, c11);
+
+                    c02 = a0.fma(b2, c02);
+                    c12 = a1.fma(b2, c12);
+                    c03 = a0.fma(b3, c03);
+                    c13 = a1.fma(b3, c13);
+
+                    aIndx += VECTOR_LENGTH2;
+                    bIndx += SGEMM_UNROLL_N;
+                }
+                int col0 = cOffset + cRow + cCol * ldc;
+                int col1 = col0 + ldc;
+                int col2 = col1 + ldc;
+                int col3 = col2 + ldc;
+                addScaled(alphaVec, c00, sc, col0);
+                addScaled(alphaVec, c10, sc, col0 + VECTOR_LENGTH);
+                addScaled(alphaVec, c01, sc, col1);
+                addScaled(alphaVec, c11, sc, col1 + VECTOR_LENGTH);
+                addScaled(alphaVec, c02, sc, col2);
+                addScaled(alphaVec, c12, sc, col2 + VECTOR_LENGTH);
+                addScaled(alphaVec, c03, sc, col3);
+                addScaled(alphaVec, c13, sc, col3 + VECTOR_LENGTH);
+
+                cRow += VECTOR_LENGTH2;
+                countI -= VECTOR_LENGTH2;
+            }
+            if (countI >= VECTOR_LENGTH) {
+                int bIndx = bStart;
+                FloatVector c00 = FloatVector.zero(SSPECIES);
+                FloatVector c01 = FloatVector.zero(SSPECIES);
+                FloatVector c02 = FloatVector.zero(SSPECIES);
+                FloatVector c03 = FloatVector.zero(SSPECIES);
+                for (int countL = kc; countL > 0; countL--) {
+                    FloatVector a0 = FloatVector.fromArray(SSPECIES, sa, aIndx);
+
+                    FloatVector b0 = FloatVector.broadcast(SSPECIES, sb[bIndx]);
+                    FloatVector b1 = FloatVector.broadcast(SSPECIES, sb[bIndx + 1]);
+                    FloatVector b2 = FloatVector.broadcast(SSPECIES, sb[bIndx + 2]);
+                    FloatVector b3 = FloatVector.broadcast(SSPECIES, sb[bIndx + 3]);
+
+                    c00 = a0.fma(b0, c00);
+                    c01 = a0.fma(b1, c01);
+                    c02 = a0.fma(b2, c02);
+                    c03 = a0.fma(b3, c03);
+
+                    aIndx += VECTOR_LENGTH;
+                    bIndx += SGEMM_UNROLL_N;
+                }
+                int col0 = cOffset + cRow + cCol * ldc;
+                addScaled(alphaVec, c00, sc, col0);
+                addScaled(alphaVec, c01, sc, col0 + ldc);
+                addScaled(alphaVec, c02, sc, col0 + 2 * ldc);
+                addScaled(alphaVec, c03, sc, col0 + 3 * ldc);
+
+                cRow += VECTOR_LENGTH;
+                countI -= VECTOR_LENGTH;
+            }
+            while (countI > 0) {
+                int bIndx = bStart;
+                float sum0 = 0.0f;
+                float sum1 = 0.0f;
+                float sum2 = 0.0f;
+                float sum3 = 0.0f;
+                for (int countL = kc; countL > 0; countL--) {
+                    sum0 += sa[aIndx] * sb[bIndx];
+                    sum1 += sa[aIndx] * sb[bIndx + 1];
+                    sum2 += sa[aIndx] * sb[bIndx + 2];
+                    sum3 += sa[aIndx] * sb[bIndx + 3];
+                    aIndx += 1;
+                    bIndx += SGEMM_UNROLL_N;
+                }
+                int col0 = cOffset + cRow + cCol * ldc;
+                sc[col0] += alpha * sum0;
+                sc[col0 + ldc] += alpha * sum1;
+                sc[col0 + 2 * ldc] += alpha * sum2;
+                sc[col0 + 3 * ldc] += alpha * sum3;
+
+                cRow += 1;
+                countI -= 1;
+            }
+            cCol += SGEMM_UNROLL_N;
+        }
+    }
+
+    /**
+     * onCopy packs the sizeM x sizeN block of src that starts at (srcRow, srcCol) for the right-hand operand.
+     * Columns are taken SGEMM_UNROLL_N at a time with the values of one row stored side by side; the columns
+     * left over are appended one whole column after another.
+     */
+    protected static void onCopy(int sizeM, int sizeN, float[] src, int srcRow, int srcCol, int srcOffset, int srcLd,
+            float[] dst, int dstOffset) {
+        int col = 0;
+        int colPackSize = SGEMM_UNROLL_N;
+        int dstIndex = dstOffset;
+        for (; col < sizeN - sizeN % colPackSize; col += colPackSize) {
+            // Row 0 of the first column in this group; the following columns sit srcLd elements apart.
+            int srcIndex = srcRow + (srcCol + col) * srcLd + srcOffset;
+            for (int row = 0; row < sizeM; row++) {
+                dst[dstIndex] = src[srcIndex + row];
+                dst[dstIndex + 1] = src[srcIndex + row + srcLd];
+                dst[dstIndex + 2] = src[srcIndex + row + 2 * srcLd];
+                dst[dstIndex + 3] = src[srcIndex + row + 3 * srcLd];
+                dstIndex += colPackSize;
+            }
+        }
+        for (; col < sizeN; col++) {
+            int srcIndex = srcRow + (srcCol + col) * srcLd + srcOffset;
+            for (int row = 0; row < sizeM; row++) {
+                dst[dstIndex++] = src[srcIndex + row];
+            }
+        }
+    }
+
+    /**
+     * itCopy packs the sizeM x sizeN block of src that starts at (srcRow, srcCol) for the left-hand operand.
+     * Rows are taken in strips of VECTOR_LENGTH4, VECTOR_LENGTH2, VECTOR_LENGTH or 1 (widest first); for each
+     * strip the contiguous run of every column is appended to dst.
+     */
+    protected static void itCopy(int sizeM, int sizeN, float[] src, int srcRow, int srcCol, int srcOffset, int srcLd,
+            float[] dst, int dstOffset) {
+        int row = 0;
+        int dstIndex = dstOffset;
+        int[] vectorLengthList = {VECTOR_LENGTH4, VECTOR_LENGTH2, VECTOR_LENGTH, 1};
+        for (int vectorLen : vectorLengthList) {
+            for (; row + vectorLen <= sizeM; row += vectorLen) {
+                int srcIndex = srcOffset + srcRow + row + srcCol * srcLd;
+                for (int col = 0; col < sizeN; col++) {
+                    System.arraycopy(src, srcIndex + col * srcLd, dst, dstIndex, vectorLen);
+                    dstIndex += vectorLen;
+                }
+            }
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/singleprecision/Sgemm.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/singleprecision/Sgemm.java
new file mode 100644
index 0000000000000000000000000000000000000000..d134adaea0ee32b92597ea7f29f4589fe0d42087
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/singleprecision/Sgemm.java
@@ -0,0 +1,204 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas3.singleprecision;
+
+import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.SGEMM_P;
+import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.SGEMM_Q;
+import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.SGEMM_R;
+import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.SGEMM_UNROLL_N;
+import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.VECTOR_LENGTH;
+import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.VECTOR_LENGTH2;
+import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.VECTOR_LENGTH4;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+import com.huawei.vectorblas.utils.Lsame;
+
+/**
+ * Single-precision general matrix multiply (BLAS SGEMM) on column-major matrices stored in flat arrays.
+ *
+ * <p>Work is split into blocks bounded by SGEMM_P (rows of C), SGEMM_Q (inner dimension) and SGEMM_R
+ * (columns of C). Each block of the two operands is packed into a contiguous scratch buffer and passed to
+ * {@code SblasLevel3.kernelOperation16x4} together with alpha and the target position in C.
+ */
+public class Sgemm {
+    /**
+     * Validates the arguments, applies beta to C and then adds the alpha-scaled product into C.
+     *
+     * <p>Returns without touching any array when {@code m} or {@code n} is zero, and skips the product when
+     * {@code alpha} is zero or {@code k} is zero.
+     *
+     * @param transa "N" to use A as stored, "T" to use its transpose (case-insensitive)
+     * @param transb "N" to use B as stored, "T" to use its transpose (case-insensitive)
+     * @param m rows of C
+     * @param n columns of C
+     * @param k inner dimension of the product
+     * @param alpha scalar passed to the multiply kernel
+     * @param a array holding A
+     * @param aOffset index in {@code a} of the first element of A
+     * @param lda leading dimension of A
+     * @param b array holding B
+     * @param bOffset index in {@code b} of the first element of B
+     * @param ldb leading dimension of B
+     * @param beta scalar passed to {@code SblasLevel3.betaMulC}; that call is skipped when beta equals 1
+     * @param c array holding C, updated in place
+     * @param cOffset index in {@code c} of the first element of C
+     * @param ldc leading dimension of C
+     * @throws IllegalArgumentException if a flag, a dimension or a leading dimension is illegal
+     * @throws ArrayIndexOutOfBoundsException if an array is too short for the matrix it must hold
+     */
+    public static void sgemm(String transa, String transb, int m, int n, int k, float alpha, float[] a, int aOffset,
+        int lda, float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc) {
+        BlasUtils.checkParameter("SGEMM", 1, Lsame.lsame(transa, "N") || Lsame.lsame(transa, "T"));
+        BlasUtils.checkParameter("SGEMM", 2, Lsame.lsame(transb, "N") || Lsame.lsame(transb, "T"));
+        // Note the polarity: each flag is true when the operand is used as stored (not transposed).
+        boolean transaFlag = Lsame.lsame(transa, "N");
+        boolean transbFlag = Lsame.lsame(transb, "N");
+        BlasUtils.checkParameter("SGEMM", 3, m >= 0);
+        BlasUtils.checkParameter("SGEMM", 4, n >= 0);
+        BlasUtils.checkParameter("SGEMM", 5, k >= 0);
+        BlasUtils.checkParameter("SGEMM", 8, lda >= Math.max(1, (transaFlag ? m : k)));
+        BlasUtils.checkParameter("SGEMM", 10, ldb >= Math.max(1, (transbFlag ? k : n)));
+        BlasUtils.checkParameter("SGEMM", 13, ldc >= Math.max(1, m));
+
+        if (m == 0 || n == 0) {
+            return;
+        }
+        if (Float.compare(beta, 1.0f) != 0) {
+            BlasUtils.checkBlasArray("c", cOffset, (m - 1) + (n - 1) * ldc, c.length);
+            SblasLevel3.betaMulC(m, n, beta, c, cOffset, ldc);
+        }
+        if (BlasUtils.isZero(alpha) || k == 0) {
+            return;
+        }
+        // The stored A is m x k when not transposed and k x m otherwise; the same holds for B with k and n.
+        BlasUtils.checkBlasArray("a", aOffset, ((transaFlag ? m : k) - 1) + ((transaFlag ? k : m) - 1) * lda, a.length);
+        BlasUtils.checkBlasArray("b", bOffset, ((transbFlag ? k : n) - 1) + ((transbFlag ? n : k) - 1) * ldb, b.length);
+        BlasUtils.checkBlasArray("c", cOffset, (m - 1) + (n - 1) * ldc, c.length);
+        sgemmVector(transa, transb, m, n, k, a, aOffset, lda, alpha, b, bOffset, ldb, c, cOffset, ldc);
+    }
+
+    /**
+     * Runs the blocked multiply: packs a block of B, then for every row block packs A and calls the kernel.
+     *
+     * <p>Block sizes are recomputed for every block, so a short trailing block does not shrink the blocks
+     * used on later passes over the same dimension.
+     */
+    private static void sgemmVector(String transa, String transb, int sizeM, int sizeN, int sizeK, float[] sa,
+        int aOffset, int lda, float alpha, float[] sb, int bOffset, int ldb, float[] sc, int cOffset, int ldc) {
+        boolean noTransA = Lsame.lsame(transa, "N");
+        boolean noTransB = Lsame.lsame(transb, "N");
+        int maxMc = Math.min(SGEMM_P, sizeM);
+        int maxNc = Math.min(SGEMM_R, sizeN);
+        int maxKc = Math.min(SGEMM_Q, sizeK);
+        float[] packa = new float[maxKc * maxMc];
+        float[] packb = new float[maxKc * maxNc];
+        for (int ns = 0; ns < sizeN; ns += maxNc) {
+            int nc = Math.min(maxNc, sizeN - ns);
+            for (int ks = 0; ks < sizeK; ks += maxKc) {
+                int kc = Math.min(maxKc, sizeK - ks);
+                if (noTransB) {
+                    SblasLevel3.onCopy(kc, nc, sb, ks, ns, bOffset, ldb, packb, 0); // packing matrix b
+                } else {
+                    otCopy(nc, kc, sb, ns, ks, bOffset, ldb, packb, 0);
+                }
+                for (int ms = 0; ms < sizeM; ms += maxMc) {
+                    int mc = Math.min(maxMc, sizeM - ms);
+                    if (noTransA) {
+                        SblasLevel3.itCopy(mc, kc, sa, ms, ks, aOffset, lda, packa, 0); // packing matrix a
+                    } else {
+                        inCopy(kc, mc, sa, ks, ms, aOffset, lda, packa, 0);
+                    }
+                    SblasLevel3.kernelOperation16x4(mc, nc, kc, alpha, packa, packb, 0, sc, ldc, cOffset, ms, ns);
+                }
+            }
+        }
+    }
+
+    /**
+     * otCopy method is used for transpose packing matrix in the right.
+     *
+     * <p>Rows of the source block are taken SGEMM_UNROLL_N at a time: for each column, that many consecutive
+     * source elements are copied as one run. Leftover rows are then copied one element at a time, row by row.
+     *
+     * @param sizeM number of source rows to pack
+     * @param sizeN number of source columns to pack
+     * @param src source array, addressed as row + column * srcLd
+     * @param srcRow first source row
+     * @param srcCol first source column
+     * @param srcOffset index in {@code src} of the matrix origin
+     * @param srcLd leading dimension of the source
+     * @param dst destination buffer, written contiguously
+     * @param dstOffset index in {@code dst} of the first element written
+     */
+    private static void otCopy(int sizeM, int sizeN, float[] src, int srcRow, int srcCol, int srcOffset, int srcLd,
+        float[] dst, int dstOffset) {
+        int panelRows = SGEMM_UNROLL_N;
+        int fullPanelEnd = sizeM - sizeM % panelRows;
+        int writePos = dstOffset;
+        int row = 0;
+        for (; row < fullPanelEnd; row += panelRows) {
+            for (int col = 0; col < sizeN; col++) {
+                System.arraycopy(src, (srcRow + row) + (srcCol + col) * srcLd + srcOffset, dst, writePos, panelRows);
+                writePos += panelRows;
+            }
+        }
+        for (; row < sizeM; row++) {
+            for (int col = 0; col < sizeN; col++) {
+                dst[writePos++] = src[(srcRow + row) + (srcCol + col) * srcLd + srcOffset];
+            }
+        }
+    }
+
+    /**
+     * inCopy is used for normally packing matrix in the left.
+     *
+     * <p>Columns are grouped into panels whose widths are tried in the order VECTOR_LENGTH4, VECTOR_LENGTH2,
+     * VECTOR_LENGTH and 1. Inside a panel, each source row contributes one element from every panel column.
+     *
+     * @param sizeM number of source rows to pack
+     * @param sizeN number of source columns to pack
+     * @param src source array, addressed as row + column * srcLd
+     * @param srcRow first source row
+     * @param srcCol first source column
+     * @param srcOffset index in {@code src} of the matrix origin
+     * @param srcLd leading dimension of the source
+     * @param dst destination buffer, written contiguously
+     * @param dstOffset index in {@code dst} of the first element written
+     */
+    private static void inCopy(int sizeM, int sizeN, float[] src, int srcRow, int srcCol, int srcOffset, int srcLd,
+        float[] dst, int dstOffset) {
+        int[] panelWidths = {VECTOR_LENGTH4, VECTOR_LENGTH2, VECTOR_LENGTH, 1};
+        int writePos = dstOffset;
+        int col = 0;
+        for (int width : panelWidths) {
+            for (; col + width <= sizeN; col += width) {
+                for (int row = 0; row < sizeM; row++) {
+                    int rowBase = srcOffset + (srcRow + row) + (srcCol + col) * srcLd;
+                    for (int lane = 0; lane < width; lane++) {
+                        dst[writePos++] = src[rowBase + lane * srcLd];
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/singleprecision/Ssymm.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/singleprecision/Ssymm.java
new file mode 100644
index 0000000000000000000000000000000000000000..507077e5d2fe1f2d9367e571765f148f55a6ab51
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/singleprecision/Ssymm.java
@@ -0,0 +1,217 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.blas3.singleprecision;
+
+import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.SGEMM_P;
+import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.SGEMM_Q;
+import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.SGEMM_R;
+import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.SGEMM_UNROLL_N;
+import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.VECTOR_LENGTH;
+import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.VECTOR_LENGTH2;
+import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.VECTOR_LENGTH4;
+
+import com.huawei.vectorblas.utils.BlasUtils;
+import com.huawei.vectorblas.utils.Lsame;
+
+/**
+ * Single-precision symmetric matrix multiply (BLAS SSYMM) on column-major matrices stored in flat arrays.
+ *
+ * <p>The symmetric operand A is on the left for side "L" and on the right for side "R"; only the triangle
+ * named by uplo is read. While packing A, an element outside that triangle is fetched from its mirror
+ * position inside it.
+ */
+public class Ssymm {
+    /**
+     * Validates the arguments, applies beta to C and then adds the alpha-scaled product into C.
+     *
+     * <p>Returns without touching any array when {@code m} or {@code n} is zero, and skips the product when
+     * {@code alpha} is zero.
+     *
+     * @param side "L" if A is the left operand (order m), "R" if it is the right operand (order n)
+     * @param uplo "U" if the upper triangle of A is stored, "L" if the lower triangle is stored
+     * @param m rows of B and C
+     * @param n columns of B and C
+     * @param alpha scalar passed to the multiply kernel
+     * @param a array holding A
+     * @param aOffset index in {@code a} of the first element of A
+     * @param lda leading dimension of A
+     * @param b array holding B
+     * @param bOffset index in {@code b} of the first element of B
+     * @param ldb leading dimension of B
+     * @param beta scalar passed to {@code SblasLevel3.betaMulC}; that call is skipped when beta equals 1
+     * @param c array holding C, updated in place
+     * @param cOffset index in {@code c} of the first element of C
+     * @param ldc leading dimension of C
+     * @throws IllegalArgumentException if a flag, a dimension or a leading dimension is illegal
+     * @throws ArrayIndexOutOfBoundsException if an array is too short for the matrix it must hold
+     */
+    public static void ssymm(String side, String uplo, int m, int n, float alpha, float[] a, int aOffset, int lda,
+        float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc) {
+        BlasUtils.checkParameter("SSYMM", 1, Lsame.lsame(side, "L") || Lsame.lsame(side, "R"));
+        BlasUtils.checkParameter("SSYMM", 2, Lsame.lsame(uplo, "U") || Lsame.lsame(uplo, "L"));
+        boolean sideFlag = Lsame.lsame(side, "L");
+        BlasUtils.checkParameter("SSYMM", 3, m >= 0);
+        BlasUtils.checkParameter("SSYMM", 4, n >= 0);
+        BlasUtils.checkParameter("SSYMM", 7, lda >= Math.max(1, (sideFlag ? m : n)));
+        BlasUtils.checkParameter("SSYMM", 9, ldb >= Math.max(1, m));
+        BlasUtils.checkParameter("SSYMM", 12, ldc >= Math.max(1, m));
+
+        if (m == 0 || n == 0) {
+            return;
+        }
+        if (Float.compare(beta, 1.0f) != 0) {
+            BlasUtils.checkBlasArray("c", cOffset, (m - 1) + (n - 1) * ldc, c.length);
+            SblasLevel3.betaMulC(m, n, beta, c, cOffset, ldc);
+        }
+        if (BlasUtils.isZero(alpha)) {
+            return;
+        }
+        // A is square; its order is also the inner dimension handed to the blocked multiply.
+        int orderA = sideFlag ? m : n;
+        BlasUtils.checkBlasArray("a", aOffset, (orderA - 1) + (orderA - 1) * lda, a.length);
+        BlasUtils.checkBlasArray("b", bOffset, (m - 1) + (n - 1) * ldb, b.length);
+        BlasUtils.checkBlasArray("c", cOffset, (m - 1) + (n - 1) * ldc, c.length);
+        ssymmVector(side, uplo, m, n, orderA, a, aOffset, lda, alpha, b, bOffset, ldb, c, cOffset, ldc);
+    }
+
+    /**
+     * Runs the blocked multiply, packing the symmetric operand with the routine that matches side and uplo.
+     *
+     * <p>Block sizes are recomputed for every block, so a short trailing block does not shrink the blocks
+     * used on later passes over the same dimension.
+     */
+    private static void ssymmVector(String side, String uplo, int sizeM, int sizeN, int sizeK, float[] sa, int aOffset,
+        int lda, float alpha, float[] sb, int bOffset, int ldb, float[] sc, int cOffset, int ldc) {
+        boolean leftSide = Lsame.lsame(side, "L");
+        boolean upper = Lsame.lsame(uplo, "U");
+        int maxMc = Math.min(SGEMM_P, sizeM);
+        int maxNc = Math.min(SGEMM_R, sizeN);
+        int maxKc = Math.min(SGEMM_Q, sizeK);
+        float[] packa = new float[maxKc * (leftSide ? maxMc : maxNc)];
+        float[] packb = new float[maxKc * (leftSide ? maxNc : maxMc)];
+        for (int ns = 0; ns < sizeN; ns += maxNc) {
+            int nc = Math.min(maxNc, sizeN - ns);
+            for (int ks = 0; ks < sizeK; ks += maxKc) {
+                int kc = Math.min(maxKc, sizeK - ks);
+                // Right-hand operand of the kernel: B when A is on the left, otherwise the symmetric A itself.
+                if (leftSide) {
+                    SblasLevel3.onCopy(kc, nc, sb, ks, ns, bOffset, ldb, packb, 0);
+                } else if (upper) {
+                    outCopy(kc, nc, sa, aOffset, lda, packa, 0, ns, ks);
+                } else {
+                    oltCopy(kc, nc, sa, aOffset, lda, packa, 0, ns, ks);
+                }
+                for (int ms = 0; ms < sizeM; ms += maxMc) {
+                    int mc = Math.min(maxMc, sizeM - ms);
+                    if (leftSide) {
+                        if (upper) {
+                            iutCopy(kc, mc, sa, aOffset, lda, packa, 0, ms, ks);
+                        } else {
+                            iltCopy(kc, mc, sa, aOffset, lda, packa, 0, ms, ks);
+                        }
+                        SblasLevel3.kernelOperation16x4(mc, nc, kc, alpha, packa, packb, 0, sc, ldc, cOffset, ms, ns);
+                    } else {
+                        SblasLevel3.itCopy(mc, kc, sb, ms, ks, bOffset, ldb, packb, 0);
+                        SblasLevel3.kernelOperation16x4(mc, nc, kc, alpha, packb, packa, 0, sc, ldc, cOffset, ms, ns);
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * oltCopy method is used for packing lower matrix in the right.
+     */
+    private static void oltCopy(int sizeM, int sizeN, float[] src, int srcOffset, int srcLd, float[] dst,
+        int dstOffset, int posX, int posY) {
+        packSymmetric(false, new int[] {SGEMM_UNROLL_N, 1}, sizeM, sizeN, src, srcOffset, srcLd, dst, dstOffset,
+            posX, posY);
+    }
+
+    /**
+     * outCopy method is used for packing upper matrix in the right.
+     */
+    private static void outCopy(int sizeM, int sizeN, float[] src, int srcOffset, int srcLd, float[] dst,
+        int dstOffset, int posX, int posY) {
+        packSymmetric(true, new int[] {SGEMM_UNROLL_N, 1}, sizeM, sizeN, src, srcOffset, srcLd, dst, dstOffset,
+            posX, posY);
+    }
+
+    /**
+     * iltCopy method is used for packing lower matrix in the left.
+     */
+    private static void iltCopy(int sizeM, int sizeN, float[] src, int srcOffset, int srcLd, float[] dst,
+        int dstOffset, int posX, int posY) {
+        packSymmetric(false, new int[] {VECTOR_LENGTH4, VECTOR_LENGTH2, VECTOR_LENGTH, 1}, sizeM, sizeN, src,
+            srcOffset, srcLd, dst, dstOffset, posX, posY);
+    }
+
+    /**
+     * iutCopy method is used for packing upper matrix in the left.
+     */
+    private static void iutCopy(int sizeM, int sizeN, float[] src, int srcOffset, int srcLd, float[] dst,
+        int dstOffset, int posX, int posY) {
+        packSymmetric(true, new int[] {VECTOR_LENGTH4, VECTOR_LENGTH2, VECTOR_LENGTH, 1}, sizeM, sizeN, src,
+            srcOffset, srcLd, dst, dstOffset, posX, posY);
+    }
+
+    /**
+     * Packs the block of a symmetric matrix that starts at row posY, column posX, reading one triangle only.
+     *
+     * <p>Columns are grouped into panels whose widths are tried in the given order. Inside a panel, each row
+     * contributes one element per panel column. An element that lies outside the stored triangle is read
+     * from the transposed position, which holds the same value in a symmetric matrix.
+     *
+     * @param upperStored true if the upper triangle holds the data, false if the lower triangle does
+     * @param panelWidths panel widths to try, widest first
+     * @param sizeM number of rows to pack
+     * @param sizeN number of columns to pack
+     * @param src source array, addressed as row + column * srcLd
+     * @param srcOffset index in {@code src} of the matrix origin
+     * @param srcLd leading dimension of the source
+     * @param dst destination buffer, written contiguously
+     * @param dstOffset index in {@code dst} of the first element written
+     * @param posX first column of the block
+     * @param posY first row of the block
+     */
+    private static void packSymmetric(boolean upperStored, int[] panelWidths, int sizeM, int sizeN, float[] src,
+        int srcOffset, int srcLd, float[] dst, int dstOffset, int posX, int posY) {
+        int writePos = dstOffset;
+        int firstCol = posX;
+        int colsLeft = sizeN;
+        for (int width : panelWidths) {
+            for (; colsLeft >= width; colsLeft -= width, firstCol += width) {
+                for (int i = 0; i < sizeM; i++) {
+                    int row = posY + i;
+                    for (int lane = 0; lane < width; lane++) {
+                        int col = firstCol + lane;
+                        // On the diagonal both addresses coincide, so either branch reads the same element.
+                        boolean stored = upperStored ? col > row : col <= row;
+                        int index = stored ? row + col * srcLd : col + row * srcLd;
+                        dst[writePos++] = src[srcOffset + index];
+                    }
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/utils/ArrayUtil.java b/vectorBlas/src/main/java/com/huawei/vectorblas/utils/ArrayUtil.java
new file mode 100644
index 0000000000000000000000000000000000000000..61aba91f6f839afec455ae2c13b9868a9b933da9
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/utils/ArrayUtil.java
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.utils;
+
+import java.util.Random;
+
+/**
+ * Helpers for rounding loop bounds and for producing reproducible pseudo-random values.
+ */
+public class ArrayUtil {
+    /** Single generator shared by all random helpers; seeded with 0 so the sequence is repeatable. */
+    private static final Random RANDOM = new Random(0);
+
+    /**
+     * Rounds {@code length} down to a multiple of {@code size}.
+     *
+     * <p>If {@code size} has at most one bit set, the low bits of {@code length} are masked off (so a
+     * {@code size} of 0 yields 0). Otherwise the remainder of {@code length} by {@code size} is subtracted,
+     * using a floor-style remainder when {@code length} is negative.
+     *
+     * @param length value to round down
+     * @param size step to round to
+     * @return {@code length} rounded down as described
+     */
+    public static int loopBound(int length, int size) {
+        boolean singleBitOrZero = (size & (size - 1)) == 0;
+        if (singleBitOrZero) {
+            int lowBits = size - 1;
+            return length & ~lowBits;
+        }
+        int remainder = length >= 0 ? length % size : Math.floorMod(length, Math.abs(size));
+        return length - remainder;
+    }
+
+    /**
+     * Draws the next double from the shared generator.
+     *
+     * @return a value in [0, 1)
+     */
+    public static double randomDouble() {
+        return RANDOM.nextDouble();
+    }
+
+    /**
+     * Overwrites every element of {@code arr}, in index order, with a double value between -0.5 and 0.5.
+     *
+     * @param arr array to fill
+     */
+    public static void randomDoubleArray(double[] arr) {
+        int next = 0;
+        while (next < arr.length) {
+            arr[next++] = RANDOM.nextDouble() - 0.5d;
+        }
+    }
+
+    /**
+     * Draws the next float from the shared generator.
+     *
+     * @return a value in [0, 1)
+     */
+    public static float randomFloat() {
+        return RANDOM.nextFloat();
+    }
+
+    /**
+     * Overwrites every element of {@code arr}, in index order, with a float value between -0.5 and 0.5.
+     *
+     * @param arr array to fill
+     */
+    public static void randomFloatArray(float[] arr) {
+        int next = 0;
+        while (next < arr.length) {
+            arr[next++] = RANDOM.nextFloat() - 0.5f;
+        }
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/utils/BlasUtils.java b/vectorBlas/src/main/java/com/huawei/vectorblas/utils/BlasUtils.java
new file mode 100644
index 0000000000000000000000000000000000000000..aa65bab61e687e87db01b11fad6fab23a93c450c
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/utils/BlasUtils.java
@@ -0,0 +1,606 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.utils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Locale;
+import java.util.Random;
+
+/**
+ * Argument validation plus small numeric helpers: random matrix generation, norms and differences.
+ */
+public class BlasUtils {
+    private static final Logger LOG = LoggerFactory.getLogger(BlasUtils.class);
+    /** Shared generator with fixed seed 0, so generated matrices are reproducible. */
+    private static final Random RAND = new Random(0);
+
+    /**
+     * Throws if a routine argument failed validation.
+     *
+     * @param name routine name used in the message
+     * @param index position of the offending parameter, used in the message
+     * @param isValid result of the caller's check
+     * @throws IllegalArgumentException if {@code isValid} is false
+     */
+    public static void checkParameter(String name, int index, boolean isValid) {
+        if (!isValid) {
+            String msg = String.format(Locale.ROOT,
+                "** On entry to %s parameter number %d had an illegal value", name, index);
+            throw new IllegalArgumentException(msg);
+        }
+    }
+
+    /**
+     * Checks that both {@code offset} and {@code index + offset} are valid positions in an array.
+     *
+     * @param arrName array name used in the message
+     * @param offset position of the first element
+     * @param index distance from {@code offset} to the furthest element that will be accessed
+     * @param length length of the array
+     * @throws ArrayIndexOutOfBoundsException if either position is outside {@code [0, length)}
+     */
+    public static void checkBlasArray(String arrName, int offset, int index, int length) {
+        try {
+            checkBound(index + offset, length);
+            checkBound(offset, length);
+        } catch (ArrayIndexOutOfBoundsException e) {
+            throw new ArrayIndexOutOfBoundsException(
+                "Index " + index + " of array " + arrName + " out of bounds for length: " + length);
+        }
+    }
+
+    /**
+     * Checks a single position against an array length.
+     *
+     * @param index position to check
+     * @param length length of the array
+     * @throws ArrayIndexOutOfBoundsException if {@code index} is outside {@code [0, length)}
+     */
+    public static void checkBound(int index, int length) {
+        if (index < 0 || index >= length) {
+            throw new ArrayIndexOutOfBoundsException();
+        }
+    }
+
+    /**
+     * Tells whether {@code val} is positive or negative zero.
+     *
+     * @param val value to test
+     * @return true for 0.0 and -0.0, false otherwise (including NaN)
+     */
+    public static boolean isZero(double val) {
+        return Double.compare(val, 0.0d) == 0 || Double.compare(val, -0.0d) == 0;
+    }
+
+    /**
+     * Tells whether {@code val} is positive or negative zero.
+     *
+     * @param val value to test
+     * @return true for 0.0f and -0.0f, false otherwise (including NaN)
+     */
+    public static boolean isZero(float val) {
+        return Float.compare(val, 0.0f) == 0 || Float.compare(val, -0.0f) == 0;
+    }
+
+    /**
+     * Get double precision machine epsilon.
+     *
+     * @return the smallest value, found by repeated halving from 0.5, whose sum with 1.0 still differs from 1.0
+     */
+    public static double getEpsd() {
+        double eps;
+        double half = 0.5d;
+        double maxVal;
+        double f1 = 0.5d;
+        do {
+            eps = f1;
+            f1 *= half;
+            maxVal = 1.0d + f1;
+        } while (Double.compare(maxVal, 1.0d) != 0);
+        return eps;
+    }
+
+    /**
+     * Get single precision machine epsilon.
+     *
+     * @return the smallest value, found by repeated halving from 0.5, whose sum with 1.0f still differs from 1.0f
+     */
+    public static float getEpsf() {
+        float eps;
+        float half = 0.5f;
+        float maxVal;
+        float f1 = 0.5f;
+        do {
+            eps = f1;
+            f1 *= half;
+            maxVal = 1.0f + f1;
+        } while (Float.compare(maxVal, 1.0f) != 0);
+        return eps;
+    }
+
+    /**
+     * Generate general matrix of double precision.
+     *
+     * <p>Writes {@code sizeM} consecutive elements in each of {@code sizeN} columns spaced {@code lda} apart,
+     * each set to generator output minus 0.5. An out-of-range access ends the fill and is logged, not rethrown.
+     *
+     * @param sizeM number of rows
+     * @param sizeN number of columns
+     * @param da array to fill
+     * @param aOffset position of the matrix inside {@code da}
+     * @param lda distance between the starts of consecutive columns; may be negative
+     */
+    public static void gegen(int sizeM, int sizeN, double[] da, int aOffset, int lda) {
+        try {
+            int idx = (lda >= 0 ? 0 : (sizeN - 1) * -lda) + aOffset;
+            for (int j = 0; j < sizeN; j++) {
+                for (int i = 0; i < sizeM; i++) {
+                    da[idx + i] = RAND.nextDouble() - 0.5d;
+                }
+                idx += lda;
+            }
+        } catch (ArrayIndexOutOfBoundsException e) {
+            LOG.error(e.toString());
+        }
+    }
+
+    /**
+     * Generate general matrix of single precision.
+     *
+     * <p>Writes {@code sizeM} consecutive elements in each of {@code sizeN} columns spaced {@code lda} apart,
+     * each set to generator output minus 0.5. An out-of-range access ends the fill and is logged, not rethrown.
+     *
+     * @param sizeM number of rows
+     * @param sizeN number of columns
+     * @param sa array to fill
+     * @param aOffset position of the matrix inside {@code sa}
+     * @param lda distance between the starts of consecutive columns; may be negative
+     */
+    public static void gegen(int sizeM, int sizeN, float[] sa, int aOffset, int lda) {
+        try {
+            int idx = (lda >= 0 ? 0 : (sizeN - 1) * -lda) + aOffset;
+            for (int j = 0; j < sizeN; j++) {
+                for (int i = 0; i < sizeM; i++) {
+                    sa[idx + i] = RAND.nextFloat() - 0.5f;
+                }
+                idx += lda;
+            }
+        } catch (ArrayIndexOutOfBoundsException e) {
+            LOG.error(e.toString());
+        }
+    }
+
+    /**
+     * Calculates the infinity norm of single precision vector.
+     *
+     * <p>An out-of-range access is logged and the maximum found so far is returned.
+     *
+     * @param sizeN number of elements to visit
+     * @param sx array holding the vector
+     * @param xOffset position of the vector inside {@code sx}
+     * @param incX stride between elements; a negative stride starts from the far end
+     * @return the largest absolute value visited, or 0 if none was visited
+     */
+    public static float getInfnrm(int sizeN, float[] sx, int xOffset, int incX) {
+        int idx = (incX >= 0 ? 0 : (sizeN - 1) * -incX) + xOffset;
+        float max = 0.0f;
+        try {
+            for (int i = 0; i < sizeN; i++, idx += incX) {
+                max = Math.max(Math.abs(sx[idx]), max);
+            }
+        } catch (ArrayIndexOutOfBoundsException e) {
+            LOG.error(e.toString());
+        }
+        return max;
+    }
+
+    /**
+     * Calculates the infinity norm of double precision vector.
+     *
+     * <p>An out-of-range access is logged and the maximum found so far is returned.
+     *
+     * @param sizeN number of elements to visit
+     * @param dx array holding the vector
+     * @param xOffset position of the vector inside {@code dx}
+     * @param incX stride between elements; a negative stride starts from the far end
+     * @return the largest absolute value visited, or 0 if none was visited
+     */
+    public static double getInfnrm(int sizeN, double[] dx, int xOffset, int incX) {
+        int idx = (incX >= 0 ? 0 : (sizeN - 1) * -incX) + xOffset;
+        double max = 0.0d;
+        try {
+            for (int i = 0; i < sizeN; i++, idx += incX) {
+                max = Math.max(Math.abs(dx[idx]), max);
+            }
+        } catch (ArrayIndexOutOfBoundsException e) {
+            LOG.error(e.toString());
+        }
+        return max;
+    }
+
+    /**
+     * Calculates the difference between two double precision vectors.
+     *
+     * <p>Stores {@code dx - dy} element by element into {@code dz}; a negative stride starts from the far end.
+     *
+     * @param sizeN number of elements
+     * @param dx array holding the minuend
+     * @param xOffset position of the minuend inside {@code dx}
+     * @param incX stride of the minuend
+     * @param dy array holding the subtrahend
+     * @param yOffset position of the subtrahend inside {@code dy}
+     * @param incY stride of the subtrahend
+     * @param dz array receiving the result
+     * @param zOffset position of the result inside {@code dz}
+     * @param incZ stride of the result
+     */
+    public static void getVdiff(int sizeN, double[] dx, int xOffset, int incX, double[] dy, int yOffset, int incY,
+        double[] dz, int zOffset, int incZ) {
+        int xIdx = (incX >= 0 ? 0 : (sizeN - 1) * -incX) + xOffset;
+        int yIdx = (incY >= 0 ? 0 : (sizeN - 1) * -incY) + yOffset;
+        int zIdx = (incZ >= 0 ? 0 : (sizeN - 1) * -incZ) + zOffset;
+        for (int i = sizeN; i > 0; i--, xIdx += incX, yIdx += incY, zIdx += incZ) {
+            dz[zIdx] = dx[xIdx] - dy[yIdx];
+        }
+    }
+
+    /**
+     * Calculates the difference between two single precision vectors.
+     *
+     * <p>Stores {@code sx - sy} element by element into {@code sz}; a negative stride starts from the far end.
+     *
+     * @param sizeN number of elements
+     * @param sx array holding the minuend
+     * @param xOffset position of the minuend inside {@code sx}
+     * @param incX stride of the minuend
+     * @param sy array holding the subtrahend
+     * @param yOffset position of the subtrahend inside {@code sy}
+     * @param incY stride of the subtrahend
+     * @param sz array receiving the result
+     * @param zOffset position of the result inside {@code sz}
+     * @param incZ stride of the result
+     */
+    public static void getVdiff(int sizeN, float[] sx, int xOffset, int incX, float[] sy, int yOffset, int incY,
+        float[] sz, int zOffset, int incZ) {
+        int xIdx = (incX >= 0 ? 0 : (sizeN - 1) * -incX) + xOffset;
+        int yIdx = (incY >= 0 ? 0 : (sizeN - 1) * -incY) + yOffset;
+        int zIdx = (incZ >= 0 ? 0 : (sizeN - 1) * -incZ) + zOffset;
+        for (int i = sizeN; i > 0; i--, xIdx += incX, yIdx += incY, zIdx += incZ) {
+            sz[zIdx] = sx[xIdx] - sy[yIdx];
+        }
+    }
+
+    /**
+     * Calculates the 1-norm of a general rectangular matrix of double precision.
+     *
+     * <p>Takes the largest per-column value returned by netlib {@code Dasum.dasum} with unit stride.
+     * An out-of-range access is logged and the maximum found so far is returned.
+     *
+     * @param sizeM number of rows
+     * @param sizeN number of columns
+     * @param da array holding the matrix
+     * @param aOffset position of the matrix inside {@code da}
+     * @param lda distance between the starts of consecutive columns
+     * @return the largest column value, or 0 if there are no columns
+     */
+    public static double getGenrm1(int sizeM, int sizeN, double[] da, int aOffset, int lda) {
+        double max = 0.0d;
+        int offset = aOffset;
+        try {
+            for (int j = 0; j < sizeN; j++) {
+                double t0 = org.netlib.blas.Dasum.dasum(sizeM, da, offset, 1);
+                max = Math.max(t0, max);
+                offset += lda;
+            }
+        } catch (ArrayIndexOutOfBoundsException e) {
+            LOG.error(e.toString());
+        }
+        return max;
+    }
+
+    /**
+     * Calculates the 1-norm of a general rectangular matrix of single precision.
+     *
+     * <p>Takes the largest per-column value returned by netlib {@code Sasum.sasum} with unit stride.
+     * An out-of-range access is logged and the maximum found so far is returned.
+     *
+     * @param sizeM number of rows
+     * @param sizeN number of columns
+     * @param sa array holding the matrix
+     * @param aOffset position of the matrix inside {@code sa}
+     * @param lda distance between the starts of consecutive columns
+     * @return the largest column value, or 0 if there are no columns
+     */
+    public static float getGenrm1(int sizeM, int sizeN, float[] sa, int aOffset, int lda) {
+        float max = 0.0f;
+        int offset = aOffset;
+        try {
+            for (int j = 0; j < sizeN; j++) {
+                float t0 = org.netlib.blas.Sasum.sasum(sizeM, sa, offset, 1);
+                max = Math.max(t0, max);
+                offset += lda;
+            }
+        } catch (ArrayIndexOutOfBoundsException e) {
+            LOG.error(e.toString());
+        }
+        return max;
+    }
+
+    /**
+     * Calculates the 1-norm of (A-B) matrix of double precision.
+     *
+     * @param sizeM number of rows
+     * @param sizeN number of columns
+     * @param da array holding A
+     * @param aOffset position of A inside {@code da}
+     * @param lda distance between the starts of consecutive columns of A
+     * @param db array holding B
+     * @param bOffset position of B inside {@code db}
+     * @param ldb distance between the starts of consecutive columns of B
+     * @return the largest column sum of absolute differences, or 0 if there are no columns
+     */
+    public static double getGediffnrm1(int sizeM, int sizeN, double[] da, int aOffset, int lda,
+        double[] db, int bOffset, int ldb) {
+        double max = 0.0d;
+        int offset1 = aOffset;
+        int offset2 = bOffset;
+        for (int j = 0; j < sizeN; j++) {
+            double t0 = 0.0d;
+            for (int i = 0; i < sizeM; i++) {
+                t0 += Math.abs(da[offset1 + i] - db[offset2 + i]);
+            }
+            max = Math.max(t0, max);
+            offset1 += lda;
+            offset2 += ldb;
+        }
+        return max;
+    }
+
+    /**
+     * Calculates the 1-norm of (A-B) matrix of single precision.
+     *
+     * @param sizeM number of rows
+     * @param sizeN number of columns
+     * @param sa array holding A
+     * @param aOffset position of A inside {@code sa}
+     * @param lda distance between the starts of consecutive columns of A
+     * @param sb array holding B
+     * @param bOffset position of B inside {@code sb}
+     * @param ldb distance between the starts of consecutive columns of B
+     * @return the largest column sum of absolute differences, or 0 if there are no columns
+     */
+    public static float getGediffnrm1(int sizeM, int sizeN, float[] sa, int aOffset, int lda,
+        float[] sb, int bOffset, int ldb) {
+        float max = 0.0f;
+        int offset1 = aOffset;
+        int offset2 = bOffset;
+        for (int j = 0; j < sizeN; j++) {
+            float t0 = 0.0f;
+            for (int i = 0; i < sizeM; i++) {
+                t0 += Math.abs(sa[offset1 + i] - sb[offset2 + i]);
+            }
+            max = Math.max(t0, max);
+            offset1 += lda;
+            offset2 += ldb;
+        }
+        return max;
+    }
+
+    /**
+     * Calculates the norm of a double precision symmetric packed matrix.
+     *
+     * <p>The stored triangle is read as consecutive columns with no gaps. Each off-diagonal entry is added
+     * to the totals of both its row index and its column index, and the largest total is returned.
+     * An out-of-range access is logged and the totals accumulated so far are used.
+     *
+     * @param uplo "U" (case-insensitive) if the upper triangle is stored; anything else means lower
+     * @param sizeN order of the matrix
+     * @param da array holding the packed triangle
+     * @param aOffset position of the packed data inside {@code da}
+     * @return the largest total, or 0 if {@code sizeN} is not positive
+     */
+    public static double getSpnrm(String uplo, int sizeN, double[] da, int aOffset) {
+        if (sizeN <= 0) {
+            return 0.0d;
+        }
+        double[] work = new double[sizeN];
+        try {
+            if (uplo.equalsIgnoreCase("U")) {
+                for (int j = 0, iaij = 0; j < sizeN; j++) {
+                    double t0 = 0.0d;
+                    for (int i = 0; i < j; i++, iaij++) {
+                        work[i] += Math.abs(da[iaij + aOffset]);
+                        t0 += Math.abs(da[iaij + aOffset]);
+                    }
+                    work[j] += Math.abs(da[iaij + aOffset]) + t0;
+                    iaij++;
+                }
+            } else {
+                for (int j = 0, iaij = 0; j < sizeN; j++) {
+                    double t0 = 0.0d;
+                    work[j] = Math.abs(da[iaij + aOffset]);
+                    iaij++;
+                    for (int i = j + 1; i < sizeN; i++, iaij++) {
+                        work[i] += Math.abs(da[iaij + aOffset]);
+                        t0 += Math.abs(da[iaij + aOffset]);
+                    }
+                    work[j] += t0;
+                }
+            }
+        } catch (ArrayIndexOutOfBoundsException e) {
+            LOG.error(e.toString());
+        }
+        double max = work[0];
+        for (int j = 1; j < sizeN; j++) {
+            max = Math.max(max, work[j]);
+        }
+        return max;
+    }
+
+    /**
+     * Calculates the norm of a single precision symmetric packed matrix.
+     *
+     * <p>The stored triangle is read as consecutive columns with no gaps. Each off-diagonal entry is added
+     * to the totals of both its row index and its column index, and the largest total is returned.
+     * An out-of-range access is logged and the totals accumulated so far are used.
+     *
+     * @param uplo "U" (case-insensitive) if the upper triangle is stored; anything else means lower
+     * @param sizeN order of the matrix
+     * @param sa array holding the packed triangle
+     * @param aOffset position of the packed data inside {@code sa}
+     * @return the largest total, or 0 if {@code sizeN} is not positive
+     */
+    public static float getSpnrm(String uplo, int sizeN, float[] sa, int aOffset) {
+        if (sizeN <= 0) {
+            return 0.0f;
+        }
+        float[] work = new float[sizeN];
+        try {
+            if (uplo.equalsIgnoreCase("U")) {
+                for (int j = 0, iaij = 0; j < sizeN; j++) {
+                    float t0 = 0.0f;
+                    for (int i = 0; i < j; i++, iaij++) {
+                        work[i] += Math.abs(sa[iaij + aOffset]);
+                        t0 += Math.abs(sa[iaij + aOffset]);
+                    }
+                    work[j] += Math.abs(sa[iaij + aOffset]) + t0;
+                    iaij++;
+                }
+            } else {
+                for (int j = 0, iaij = 0; j < sizeN; j++) {
+                    float t0 = 0.0f;
+                    work[j] = Math.abs(sa[iaij + aOffset]);
+                    iaij++;
+                    for (int i = j + 1; i < sizeN; i++, iaij++) {
+                        work[i] += Math.abs(sa[iaij + aOffset]);
+                        t0 += Math.abs(sa[iaij + aOffset]);
+                    }
+                    work[j] += t0;
+                }
+            }
+        } catch (ArrayIndexOutOfBoundsException e) {
+            LOG.error(e.toString());
+        }
+        float max = work[0];
+        for (int j = 1; j < sizeN; j++) {
+            max = Math.max(max, work[j]);
+        }
+        return max;
+    }
+
+    /**
+     * Calculates the norm of a upper or lower triangular part of the double precision symmetric matrix.
+     *
+     * <p>Only the triangle named by {@code uplo} is read. Each off-diagonal entry is added to the totals of
+     * both its row index and its column index, and the largest total is returned.
+     *
+     * @param uplo "U" (case-insensitive) to read the upper triangle; anything else reads the lower one
+     * @param sizeN order of the matrix
+     * @param da array holding the matrix
+     * @param aOffset position of the matrix inside {@code da}
+     * @param lda distance between the starts of consecutive columns
+     * @return the largest total, or 0 if {@code sizeN} is not positive
+     */
+    public static double getSynrm(String uplo, int sizeN, double[] da, int aOffset, int lda) {
+        int ldap12 = lda + 1;
+        if (sizeN <= 0) {
+            return 0.0d;
+        }
+        double[] work = new double[sizeN];
+        if (uplo.equalsIgnoreCase("U")) {
+            for (int j = 0, jaj = 0; j < sizeN; j++, jaj += lda) {
+                double t0 = 0.0d;
+                int iaij = jaj;
+                for (int i = 0; i < j; i++, iaij++) {
+                    work[i] += Math.abs(da[iaij + aOffset]);
+                    t0 += Math.abs(da[iaij + aOffset]);
+                }
+                work[j] += Math.abs(da[iaij + aOffset]) + t0;
+            }
+        } else {
+            for (int j = 0, jaj = 0; j < sizeN; j++, jaj += ldap12) {
+                double t0 = 0.0d;
+                work[j] = Math.abs(da[jaj + aOffset]);
+                for (int i = j + 1, iaij = jaj + 1; i < sizeN; i++, iaij++) {
+                    work[i] += Math.abs(da[iaij + aOffset]);
+                    t0 += Math.abs(da[iaij + aOffset]);
+                }
+                work[j] += t0;
+            }
+        }
+        double max = work[0];
+        for (int j = 1; j < sizeN; j++) {
+            max = Math.max(work[j], max);
+        }
+        return max;
+    }
+
+    /**
+     * Calculates the norm of a upper or lower triangular part of the single precision symmetric matrix.
+     *
+     * <p>Only the triangle named by {@code uplo} is read. Each off-diagonal entry is added to the totals of
+     * both its row index and its column index, and the largest total is returned.
+     *
+     * @param uplo "U" (case-insensitive) to read the upper triangle; anything else reads the lower one
+     * @param sizeN order of the matrix
+     * @param sa array holding the matrix
+     * @param aOffset position of the matrix inside {@code sa}
+     * @param lda distance between the starts of consecutive columns
+     * @return the largest total, or 0 if {@code sizeN} is not positive
+     */
+    public static float getSynrm(String uplo, int sizeN, float[] sa, int aOffset, int lda) {
+        int ldap12 = lda + 1;
+        if (sizeN <= 0) {
+            return 0.0f;
+        }
+        float[] work = new float[sizeN];
+        if (uplo.equalsIgnoreCase("U")) {
+            for (int j = 0, jaj = 0; j < sizeN; j++, jaj += lda) {
+                float t0 = 0.0f;
+                int iaij = jaj;
+                for (int i = 0; i < j; i++, iaij++) {
+                    work[i] += Math.abs(sa[iaij + aOffset]);
+                    t0 += Math.abs(sa[iaij + aOffset]);
+                }
+                work[j] += Math.abs(sa[iaij + aOffset]) + t0;
+            }
+        } else {
+            for (int j = 0, jaj = 0; j < sizeN; j++, jaj += ldap12) {
+                float t0 = 0.0f;
+                work[j] = Math.abs(sa[jaj + aOffset]);
+                for (int i = j + 1, iaij = jaj + 1; i < sizeN; i++, iaij++) {
+                    work[i] += Math.abs(sa[iaij + aOffset]);
+                    t0 += Math.abs(sa[iaij + aOffset]);
+                }
+                work[j] += t0;
+            }
+        }
+        float max = work[0];
+        for (int j = 1; j < sizeN; j++) {
+            max = Math.max(work[j], max);
+        }
+        return max;
+    }
+}
diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/utils/Lsame.java b/vectorBlas/src/main/java/com/huawei/vectorblas/utils/Lsame.java
new file mode 100644
index 0000000000000000000000000000000000000000..8da45426d0a1da0acec22a733c7c40eaaa7a7278
--- /dev/null
+++ b/vectorBlas/src/main/java/com/huawei/vectorblas/utils/Lsame.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2023. Huawei Technologies Co., Ltd.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.huawei.vectorblas.utils;
+
+/**
+ * Case-insensitive flag comparison used to decode BLAS option arguments.
+ */
+public final class Lsame {
+    /**
+     * LSAME returns .TRUE. if CA is the same letter as CB regardless of case.
+     *
+     * <p>The whole of {@code cA} must match the start of {@code cB}, ignoring case. A {@code null} or empty
+     * {@code cA} and a {@code null} {@code cB} never match.
+     *
+     * @param cA character a
+     * @param cB character b
+     * @return true or false
+     */
+    public static boolean lsame(String cA, String cB) {
+        if (cA == null || cB == null || cA.isEmpty()) {
+            return false;
+        }
+        return cA.regionMatches(true, 0, cB, 0, cA.length());
+    }
+}
\ No newline at end of file