diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..23d4bb35443429c9ab12932e3ecc6502ac8f4863 --- /dev/null +++ b/LICENSE @@ -0,0 +1,50 @@ +vectorBlas is licensed under the Apache License. + +Copyright (C) 2023. Huawei Technologies Co., Ltd. +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with +this work for additional information regarding copyright ownership. +The ASF licenses this file to You under the Apache License, Version 2.0 +(the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + + +----------------------------------------------------------------------------- + +This product also contains code from third parties, under the following licenses: + +f2jblas +------- + +Copyright © 2022 The University of Tennessee. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: +· Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+· Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer listed in this license in the documentation + and/or other materials provided with the distribution. +· Neither the name of the copyright holders nor the names of its contributors may be used to endorse + or promote products derived from this software without specific prior written permission. + +This software is provided by the copyright holders and contributors "as is" and any express or implied warranties, +including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose +are disclaimed. In no event shall the copyright owner or contributors be liable for any direct, indirect, +incidental, special, exemplary, or consequential damages (including, but not limited to, procurement of +substitute goods or services; loss of use, data, or profits; or business interruption) however caused and +on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise) +arising in any way out of the use of this software, even if advised of the possibility of such damage. 
+ diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..8d54c1b2ca4b3b3682db043028a7c0cd4c405cd1 --- /dev/null +++ b/pom.xml @@ -0,0 +1,35 @@ + + + 4.0.0 + + com.huawei.vector + parent + 1.0 + pom + + + vectorBlas + + + + 8 + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.0 + + UTF-8 + ${java.version} + ${java.version} + ${java.version} + + + + + diff --git a/vectorBlas/pom.xml b/vectorBlas/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..9d1da1be401b26bdce2ed3b852d1d7ee5a8d3c94 --- /dev/null +++ b/vectorBlas/pom.xml @@ -0,0 +1,85 @@ + + + 4.0.0 + + + com.huawei.vector + parent + 1.0 + ../pom.xml + + + vectorBlas + 1.0 + jar + + + + + org.apache.maven.plugins + maven-surefire-plugin + 3.0.0-M7 + + @{argLine} --add-modules=jdk.incubator.vector + + + + org.jacoco + jacoco-maven-plugin + 0.8.8 + + + + prepare-agent + + + + default-report + test + + report + + + + + + + + + + org.junit.jupiter + junit-jupiter-engine + 5.9.1 + test + + + org.junit.vintage + junit-vintage-engine + 5.9.1 + test + + + net.sourceforge.f2j + arpack_combined_all + 0.1 + compile + + + org.jacoco + jacoco-maven-plugin + 0.8.8 + + + org.slf4j + slf4j-api + 2.0.4 + + + org.slf4j + slf4j-simple + 2.0.4 + + + diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/BLAS.java b/vectorBlas/src/main/java/com/huawei/vectorblas/BLAS.java new file mode 100644 index 0000000000000000000000000000000000000000..adac74745ad00d58437b0737895ea26785f11623 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/BLAS.java @@ -0,0 +1,346 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
/**
 * Declares the BLAS-style routines offered by this module.
 *
 * <p>What the signatures themselves show:
 * <ul>
 *   <li>Routines come in a {@code d...} flavour working on {@code double} data and an
 *       {@code s...} flavour working on {@code float} data; {@code idamax}/{@code isamax}
 *       are the respective flavours that return an {@code int}.</li>
 *   <li>Every routine is declared twice: a short form, and an offset form that takes an extra
 *       {@code int} start offset ({@code xOffset}, {@code yOffset}, {@code aOffset},
 *       {@code bOffset}, {@code cOffset}, {@code paramOffset}) directly after each array
 *       argument.</li>
 *   <li>Vector arguments are paired with an increment ({@code incx}/{@code incy}); most
 *       two-dimensional arguments are paired with a leading dimension
 *       ({@code lda}/{@code ldb}/{@code ldc}).</li>
 * </ul>
 *
 * <p>NOTE(review): the numerical contract of each routine and the accepted values of the
 * {@code String} option arguments ({@code trans}, {@code transa}, {@code transb},
 * {@code uplo}, {@code diag}, {@code side}) are presumably those of the reference BLAS routine
 * with the same name - confirm against the implementing classes.
 */
public interface BLAS {
    // BLAS 1
    // Routines whose array operands are strided vectors (x/incx and, where present, y/incy).
    double dasum(int n, double[] x, int incx);

    double dasum(int n, double[] x, int xOffset, int incx);

    float sasum(int n, float[] x, int incx);

    float sasum(int n, float[] x, int xOffset, int incx);

    void daxpy(int n, double alpha, double[] x, int incx, double[] y, int incy);

    void daxpy(int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy);

    void saxpy(int n, float alpha, float[] x, int incx, float[] y, int incy);

    void saxpy(int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy);

    void dcopy(int n, double[] x, int incx, double[] y, int incy);

    void dcopy(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy);

    void scopy(int n, float[] x, int incx, float[] y, int incy);

    void scopy(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy);

    double ddot(int n, double[] x, int incx, double[] y, int incy);

    double ddot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy);

    float sdot(int n, float[] x, int incx, float[] y, int incy);

    float sdot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy);

    float snrm2(int n, float[] x, int incx);

    float snrm2(int n, float[] x, int xOffset, int incx);

    double dnrm2(int n, double[] x, int incx);

    double dnrm2(int n, double[] x, int xOffset, int incx);

    void srot(int n, float[] x, int incx, float[] y, int incy, float c, float s);

    void srot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, float c, float s);

    void drot(int n, double[] x, int incx, double[] y, int incy, double c, double s);

    void drot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy, double c, double s);

    // srotm/drotm additionally take a parameter array; the offset form reads it from paramOffset.
    void srotm(int n, float[] x, int incx, float[] y, int incy, float[] param);

    void srotm(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, float[] param,
            int paramOffset);

    void drotm(int n, double[] x, int incx, double[] y, int incy, double[] param);

    void drotm(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy, double[] param,
            int paramOffset);

    void sscal(int n, float alp, float[] x, int incx);

    void sscal(int n, float alp, float[] x, int xOffset, int incx);

    void dscal(int n, double alp, double[] x, int incx);

    void dscal(int n, double alp, double[] x, int xOffset, int incx);

    void sswap(int n, float[] x, int incx, float[] y, int incy);

    void sswap(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy);

    void dswap(int n, double[] x, int incx, double[] y, int incy);

    void dswap(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy);

    int isamax(int n, float[] x, int incx);

    int isamax(int n, float[] x, int xOffset, int incx);

    int idamax(int n, double[] x, int incx);

    int idamax(int n, double[] x, int xOffset, int incx);

    // BLAS 2
    // Routines combining an array operand a/ap (with lda where one is declared) with strided
    // vectors. NOTE(review): presumably the matrix-vector level of BLAS - confirm.
    void dgbmv(String trans, int m, int n, int kl, int ku, double alpha, double[] a, int lda, double[] x,
            int incx, double beta, double[] y, int incy);

    void dgbmv(String trans, int m, int n, int kl, int ku, double alpha, double[] a, int aOffset,
            int lda, double[] x, int xOffset, int incx, double beta, double[] y, int yOffset, int incy);

    void sgbmv(String trans, int m, int n, int kl, int ku, float alpha, float[] a, int lda, float[] x,
            int incx, float beta, float[] y, int incy);

    void sgbmv(String trans, int m, int n, int kl, int ku, float alpha, float[] a, int aOffset, int lda,
            float[] x, int xOffset, int incx, float beta, float[] y, int yOffset, int incy);

    void dgemv(String trans, int m, int n, double alpha, double[] a, int lda, double[] x,
            int incx, double beta, double[] y, int incy);

    void dgemv(String trans, int m, int n, double alpha, double[] a, int aOffset, int lda, double[] x,
            int xOffset, int incx, double beta, double[] y, int yOffset, int incy);

    void sgemv(String trans, int m, int n, float alpha, float[] a, int lda, float[] x,
            int incx, float beta, float[] y, int incy);

    void sgemv(String trans, int m, int n, float alpha, float[] a, int aOffset, int lda, float[] x,
            int xOffset, int incx, float beta, float[] y, int yOffset, int incy);

    void dger(int m, int n, double alpha, double[] x, int incx, double[] y, int incy, double[] a, int lda);

    void dger(int m, int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset,
            int incy, double[] a, int aOffset, int lda);

    void sger(int m, int n, float alpha, float[] x, int incx, float[] y, int incy, float[] a, int lda);

    void sger(int m, int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset,
            int incy, float[] a, int aOffset, int lda);

    void dsbmv(String uplo, int n, int k, double alpha, double[] a, int lda, double[] x, int incx,
            double beta, double[] y, int incy);

    void dsbmv(String uplo, int n, int k, double alpha, double[] a, int aOffset, int lda, double[] x,
            int xOffset, int incx, double beta, double[] y, int yOffset, int incy);

    void ssbmv(String uplo, int n, int k, float alpha, float[] a, int lda, float[] x, int incx,
            float beta, float[] y, int incy);

    void ssbmv(String uplo, int n, int k, float alpha, float[] a, int aOffset, int lda, float[] x,
            int xOffset, int incx, float beta, float[] y, int yOffset, int incy);

    void dspmv(String uplo, int n, double alpha, double[] a, double[] x, int incx, double beta, double[] y, int incy);

    void dspmv(String uplo, int n, double alpha, double[] a, int aOffset, double[] x, int xOffset,
            int incx, double beta, double[] y, int yOffset, int incy);

    void sspmv(String uplo, int n, float alpha, float[] a, float[] x, int incx, float beta, float[] y, int incy);

    void sspmv(String uplo, int n, float alpha, float[] a, int aOffset, float[] x, int xOffset,
            int incx, float beta, float[] y, int yOffset, int incy);

    void dspr(String uplo, int n, double alpha, double[] x, int incx, double[] ap);

    void dspr(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] ap, int aOffset);

    void sspr(String uplo, int n, float alpha, float[] x, int incx, float[] ap);

    void sspr(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] ap, int aOffset);

    void dspr2(String uplo, int n, double alpha, double[] x, int incx, double[] y, int incy, double[] a);

    void dspr2(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] y,
            int yOffset, int incy, double[] a, int aOffset);

    void sspr2(String uplo, int n, float alpha, float[] x, int incx, float[] y, int incy, float[] a);

    void sspr2(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] y,
            int yOffset, int incy, float[] a, int aOffset);

    void dsymv(String uplo, int n, double alpha, double[] a, int lda, double[] x, int incx, double beta,
            double[] y, int incy);

    void dsymv(String uplo, int n, double alpha, double[] a, int aOffset, int lda, double[] x,
            int xOffset, int incx, double beta, double[] y, int yOffset, int incy);

    void ssymv(String uplo, int n, float alpha, float[] a, int lda, float[] x, int incx, float beta,
            float[] y, int incy);

    void ssymv(String uplo, int n, float alpha, float[] a, int aOffset, int lda, float[] x, int xOffset,
            int incx, float beta, float[] y, int yOffset, int incy);

    void dsyr(String uplo, int n, double alpha, double[] x, int incx, double[] a, int lda);

    void dsyr(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] a, int aOffset, int lda);

    void ssyr(String uplo, int n, float alpha, float[] x, int incx, float[] a, int lda);

    void ssyr(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] a, int aOffset, int lda);

    void dsyr2(String uplo, int n, double alpha, double[] x, int incx, double[] y, int incy, double[] a, int lda);

    void dsyr2(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] y,
            int yOffset, int incy, double[] a, int aOffset, int lda);

    void ssyr2(String uplo, int n, float alpha, float[] x, int incx, float[] y, int incy, float[] a, int lda);

    void ssyr2(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] y,
            int yOffset, int incy, float[] a, int aOffset, int lda);

    void dtbmv(String uplo, String trans, String diag, int n, int k, double[] a, int lda, double[] x, int incx);

    void dtbmv(String uplo, String trans, String diag, int n, int k, double[] a, int aOffset, int lda,
            double[] x, int xOffset, int incx);

    void stbmv(String uplo, String trans, String diag, int n, int k, float[] a, int lda, float[] x, int incx);

    void stbmv(String uplo, String trans, String diag, int n, int k, float[] a, int aOffset, int lda,
            float[] x, int xOffset, int incx);

    void dtbsv(String uplo, String trans, String diag, int n, int k, double[] a, int lda, double[] x, int incx);

    void dtbsv(String uplo, String trans, String diag, int n, int k, double[] a, int aOffset, int lda,
            double[] x, int xOffset, int incx);

    void stbsv(String uplo, String trans, String diag, int n, int k, float[] a, int lda, float[] x, int incx);

    void stbsv(String uplo, String trans, String diag, int n, int k, float[] a, int aOffset, int lda,
            float[] x, int xOffset, int incx);

    void dtpmv(String uplo, String transa, String diag, int n, double[] a, double[] x, int incx);

    void dtpmv(String uplo, String transa, String diag, int n, double[] a, int aOffset, double[] x,
            int xOffset, int incx);

    void stpmv(String uplo, String transa, String diag, int n, float[] a, float[] x, int incx);

    void stpmv(String uplo, String transa, String diag, int n, float[] a, int aOffset, float[] x,
            int xOffset, int incx);

    void dtpsv(String uplo, String transa, String diag, int n, double[] a, double[] x, int incx);

    void dtpsv(String uplo, String transa, String diag, int n, double[] a, int aOffset, double[] x,
            int xOffset, int incx);

    void stpsv(String uplo, String transa, String diag, int n, float[] a, float[] x, int incx);

    void stpsv(String uplo, String transa, String diag, int n, float[] a, int aOffset, float[] x,
            int xOffset, int incx);

    void dtrmv(String uplo, String trans, String diag, int n, double[] a, int lda, double[] x, int incx);

    void dtrmv(String uplo, String trans, String diag, int n, double[] a, int aOffset, int lda,
            double[] x, int xOffset, int incx);

    void strmv(String uplo, String trans, String diag, int n, float[] a, int lda, float[] x, int incx);

    void strmv(String uplo, String trans, String diag, int n, float[] a, int aOffset, int lda,
            float[] x, int xOffset, int incx);

    void dtrsv(String uplo, String transa, String diag, int n, double[] a, int lda, double[] x, int incx);

    void dtrsv(String uplo, String transa, String diag, int n, double[] a,
            int aOffset, int lda, double[] x, int xOffset, int incx);

    void strsv(String uplo, String transa, String diag, int n, float[] a, int lda, float[] x, int incx);

    void strsv(String uplo, String transa, String diag, int n, float[] a, int aOffset, int lda,
            float[] x, int xOffset, int incx);

    // BLAS 3
    // Routines whose array operands all carry a leading dimension (a/lda, b/ldb, c/ldc).
    // NOTE(review): presumably the matrix-matrix level of BLAS - confirm.
    void dgemm(String transa, String transb, int m, int n, int k, double alpha, double[] a, int lda,
            double[] b, int ldb, double beta, double[] c, int ldc);

    void dgemm(String transa, String transb, int m, int n, int k, double alpha, double[] a, int aOffset,
            int lda, double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc);

    void sgemm(String transa, String transb, int m, int n, int k, float alpha, float[] a,
            int lda, float[] b, int ldb, float beta, float[] c, int ldc);

    void sgemm(String transa, String transb, int m, int n, int k, float alpha, float[] a, int aOffset,
            int lda, float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc);

    void dsymm(String side, String uplo, int m, int n, double alpha, double[] a, int lda,
            double[] b, int ldb, double beta, double[] c, int ldc);

    void dsymm(String side, String uplo, int m, int n, double alpha, double[] a, int aOffset, int lda,
            double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc);

    void ssymm(String side, String uplo, int m, int n, float alpha, float[] a, int lda,
            float[] b, int ldb, float beta, float[] c, int ldc);

    void ssymm(String side, String uplo, int m, int n, float alpha, float[] a, int aOffset, int lda,
            float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc);

    void dsyr2k(String uplo, String trans, int n, int k, double alpha, double[] a, int lda,
            double[] b, int ldb, double beta, double[] c, int ldc);

    void dsyr2k(String uplo, String trans, int n, int k, double alpha, double[] a, int aOffset, int lda,
            double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc);

    void ssyr2k(String uplo, String trans, int n, int k, float alpha, float[] a, int lda,
            float[] b, int ldb, float beta, float[] c, int ldc);

    void ssyr2k(String uplo, String trans, int n, int k, float alpha, float[] a, int aOffset, int lda,
            float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc);

    void dsyrk(String uplo, String trans, int n, int k, double alpha, double[] a, int lda,
            double beta, double[] c, int ldc);

    void dsyrk(String uplo, String trans, int n, int k, double alpha, double[] a, int aOffset, int lda,
            double beta, double[] c, int cOffset, int ldc);

    void ssyrk(String uplo, String trans, int n, int k, float alpha, float[] a, int lda,
            float beta, float[] c, int ldc);

    void ssyrk(String uplo, String trans, int n, int k, float alpha, float[] a, int aOffset, int lda,
            float beta, float[] c, int cOffset, int ldc);

    void dtrmm(String side, String uplo, String transa, String diag, int m, int n, double alpha,
            double[] a, int lda, double[] b, int ldb);

    void dtrmm(String side, String uplo, String transa, String diag, int m, int n, double alpha,
            double[] a, int aOffset, int lda, double[] b, int bOffset, int ldb);

    void strmm(String side, String uplo, String transa, String diag, int m, int n, float alpha, float[] a,
            int lda, float[] b, int ldb);

    void strmm(String side, String uplo, String transa, String diag, int m, int n, float alpha, float[] a,
            int aOffset, int lda, float[] b, int bOffset, int ldb);

    void dtrsm(String side, String uplo, String transa, String diag, int m, int n, double alpha,
            double[] a, int lda, double[] b, int ldb);

    void dtrsm(String side, String uplo, String transa, String diag, int m, int n, double alpha,
            double[] a, int aOffset, int lda, double[] b, int bOffset, int ldb);

    void strsm(String side, String uplo, String transa, String diag, int m, int n, float alpha, float[] a,
            int lda, float[] b, int ldb);

    void strsm(String side, String uplo, String transa, String diag, int m, int n, float alpha, float[] a,
            int aOffset, int lda, float[] b, int bOffset, int ldb);
}
b/vectorBlas/src/main/java/com/huawei/vectorblas/F2jBLAS.java new file mode 100644 index 0000000000000000000000000000000000000000..8d4de111a559ea60d89792625dcf2fc5c31a3992 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/F2jBLAS.java @@ -0,0 +1,1082 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas; + +public class F2jBLAS implements BLAS { + /** + * F2jblas dasum fixed version, use long to store (n * incx) to avoid int overflow. 
+ */ + @Override + public double dasum(int n, double[] x, int xOffset, int incx) { + int unrollSize = 6; + double dasum = 0.0D; + if (n <= 0 || incx <= 0) { + return dasum; + } else { + int index; + if (incx == 1) { + int restm = n % unrollSize; + if (restm != 0) { + index = 1; + for (int i = restm; i > 0; --i) { + dasum += Math.abs(x[index - 1 + xOffset]); + ++index; + } + if (n < unrollSize) { + return dasum; + } + } + int mp1 = restm + 1; + index = mp1; + + for (int i = (n - mp1 + unrollSize) / unrollSize; i > 0; --i) { + dasum = dasum + Math.abs(x[index - 1 + xOffset]) + Math.abs(x[index + xOffset]) + + Math.abs(x[index + 1 + xOffset]) + Math.abs(x[index + 2 + xOffset]) + + Math.abs(x[index + 3 + xOffset]) + Math.abs(x[index + 4 + xOffset]); + index += unrollSize; + } + return dasum; + } else { + long nIncx = (long) n * incx; + index = 1; + for (long i = (nIncx - 1 + incx) / incx; i > 0; --i) { + dasum += Math.abs(x[index - 1 + xOffset]); + index += incx; + } + return dasum; + } + } + } + + /** + * dasum without offset + */ + @Override + public double dasum(int n, double[] x, int incx) { + return dasum(n, x, 0, incx); + } + + + @Override + public float sasum(int n, float[] x, int incx) { + return sasum(n, x, 0, incx); + } + + /** + * F2jblas sasum fixed version, use long to store (n * incx) to avoid int overflow. 
+ */ + @Override + public float sasum(int n, float[] x, int xOffset, int incx) { + int unrollSize = 6; + float sasum = 0.0F; + if (n <= 0 || incx <= 0) { + return sasum; + } else { + int index; + if (incx == 1) { + int restm = n % unrollSize; + if (restm != 0) { + index = 1; + for (int i = restm; i > 0; --i) { + sasum += Math.abs(x[index - 1 + xOffset]); + ++index; + } + if (n < unrollSize) { + return sasum; + } + } + int mp1 = restm + 1; + index = mp1; + + for (int i = (n - mp1 + unrollSize) / unrollSize; i > 0; --i) { + sasum = sasum + Math.abs(x[index - 1 + xOffset]) + Math.abs(x[index + xOffset]) + + Math.abs(x[index + 1 + xOffset]) + Math.abs(x[index + 2 + xOffset]) + + Math.abs(x[index + 3 + xOffset]) + Math.abs(x[index + 4 + xOffset]); + index += unrollSize; + } + return sasum; + } else { + long nIncx = (long) n * incx; + index = 1; + for (long i = (nIncx - 1 + incx) / incx; i > 0; --i) { + sasum += Math.abs(x[index - 1 + xOffset]); + index += incx; + } + return sasum; + } + } + } + + @Override + public void daxpy(int n, double alpha, double[] x, int incx, double[] y, int incy) { + org.netlib.blas.Daxpy.daxpy(n, alpha, x, 0, incx, y, 0, incy); + } + + @Override + public void daxpy(int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) { + org.netlib.blas.Daxpy.daxpy(n, alpha, x, xOffset, incx, y, yOffset, incy); + } + + @Override + public void saxpy(int n, float alpha, float[] x, int incx, float[] y, int incy) { + org.netlib.blas.Saxpy.saxpy(n, alpha, x, 0, incx, y, 0, incy); + } + + @Override + public void saxpy(int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) { + org.netlib.blas.Saxpy.saxpy(n, alpha, x, xOffset, incx, y, yOffset, incy); + } + + @Override + public void dcopy(int n, double[] x, int incx, double[] y, int incy) { + org.netlib.blas.Dcopy.dcopy(n, x, 0, incx, y, 0, incy); + } + + @Override + public void dcopy(int n, double[] x, int xOffset, int incx, double[] y, int 
yOffset, int incy) { + org.netlib.blas.Dcopy.dcopy(n, x, xOffset, incx, y, yOffset, incy); + } + + @Override + public void scopy(int n, float[] x, int incx, float[] y, int incy) { + org.netlib.blas.Scopy.scopy(n, x, 0, incx, y, 0, incy); + } + + @Override + public void scopy(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) { + org.netlib.blas.Scopy.scopy(n, x, xOffset, incx, y, yOffset, incy); + } + + @Override + public double ddot(int n, double[] x, int incx, double[] y, int incy) { + return org.netlib.blas.Ddot.ddot(n, x, 0, incx, y, 0, incy); + } + + @Override + public double ddot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) { + return org.netlib.blas.Ddot.ddot(n, x, xOffset, incx, y, yOffset, incy); + } + + @Override + public float sdot(int n, float[] x, int incx, float[] y, int incy) { + return org.netlib.blas.Sdot.sdot(n, x, 0, incx, y, 0, incy); + } + + @Override + public float sdot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) { + return org.netlib.blas.Sdot.sdot(n, x, xOffset, incx, y, yOffset, incy); + } + + @Override + public double dnrm2(int n, double[] x, int incx) { + return dnrm2(n, x, 0, incx); + } + + /** + * F2jblas dnrm2 fixed version, use long to store (n * incx) to avoid int overflow. 
+ */ + @Override + public double dnrm2(int n, double[] x, int xOffset, int incx) { + double absxi = 0.0D; + double norm = 0.0D; + double scale = 0.0D; + double ssq = 0.0D; + if (n < 1 || incx < 1) { + norm = 0.0; + } else if (n == 1) { + norm = Math.abs(x[xOffset]); + } else { + scale = 0.0; + ssq = 1.0D; + int ix = 1; + for (long i = ((long) n * incx) / incx; i > 0; --i) { + if (x[ix - 1 + xOffset] != 0.0) { + absxi = Math.abs(x[ix - 1 + xOffset]); + if (scale < absxi) { + ssq = 1.0D + ssq * Math.pow(scale / absxi, (double) 2); + scale = absxi; + } else { + ssq += Math.pow(absxi / scale, (double) 2); + } + } + ix += incx; + } + norm = scale * Math.sqrt(ssq); + } + return norm; + } + + @Override + public float snrm2(int n, float[] x, int incx) { + return snrm2(n, x, 0, incx); + } + + /** + * F2jblas snrm2 fixed version, use long to store (n * incx) to avoid int overflow. + */ + @Override + public float snrm2(int n, float[] x, int xOffset, int incx) { + float absxi = 0.0F; + float norm = 0.0F; + float scale = 0.0F; + float ssq = 0.0F; + if (n < 1 || incx < 1) { + norm = 0.0F; + } else if (n == 1) { + norm = Math.abs(x[xOffset]); + } else { + scale = 0.0F; + ssq = 1.0F; + int ix = 1; + for (long i = ((long) n * incx) / incx; i > 0; --i) { + if (x[ix - 1 + xOffset] != 0.0F) { + absxi = Math.abs(x[ix - 1 + xOffset]); + if (scale < absxi) { + ssq = 1.0F + ssq * (float) Math.pow(scale / absxi, 2); + scale = absxi; + } else { + ssq += Math.pow(absxi / scale, 2); + } + } + ix += incx; + } + norm = scale * (float) Math.sqrt(ssq); + } + return norm; + } + + @Override + public void srot(int n, float[] x, int incx, float[] y, int incy, float c, float s) { + org.netlib.blas.Srot.srot(n, x, 0, incx, y, 0, incy, c, s); + } + + @Override + public void srot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, float c, float s) { + org.netlib.blas.Srot.srot(n, x, xOffset, incx, y, yOffset, incy, c, s); + } + + @Override + public void drot(int n, double[] x, int 
incx, double[] y, int incy, double c, double s) { + org.netlib.blas.Drot.drot(n, x, 0, incx, y, 0, incy, c, s); + } + + @Override + public void drot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy, double c, double s) { + org.netlib.blas.Drot.drot(n, x, xOffset, incx, y, yOffset, incy, c, s); + } + + @Override + public void srotm(int n, float[] x, int incx, float[] y, int incy, float[] param) { + srotm(n, x, 0, incx, y, 0, incy, param, 0); + } + + /** + * f2jblas srotm fixed version, use long to store (n * incx) to avoid int overflow + */ + @Override + public void srotm(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, float[] param, + int paramOffset) { + float flag = 0.0F; + float h11 = 0.0F; + float h12 = 0.0F; + float h21 = 0.0F; + float h22 = 0.0F; + float wi = 0.0F; + float zi = 0.0F; + flag = param[paramOffset]; + if (n > 0 && Float.compare(flag, -2.0F) != 0) { // If flag equals -2.0, do nothing and return directly. + int index; + if ((incx == incy && incx > 0) ^ true) { + int xIndex = 1; + int yIndex = 1; + if (incx < 0) { + xIndex = 1 + (1 - n) * incx; + } + if (incy < 0) { + yIndex = 1 + (1 - n) * incy; + } + if (flag < 0.0) { + h11 = param[2 - 1 + paramOffset]; + h12 = param[4 - 1 + paramOffset]; + h21 = param[3 - 1 + paramOffset]; + h22 = param[5 - 1 + paramOffset]; + index = 1; + for (int i = n; i > 0; --i) { + wi = x[xIndex - 1 + xOffset]; + zi = y[yIndex - 1 + yOffset]; + x[xIndex - 1 + xOffset] = wi * h11 + zi * h12; + y[yIndex - 1 + yOffset] = wi * h21 + zi * h22; + xIndex += incx; + yIndex += incy; + ++index; + } + } else if (flag == 0.0) { + h12 = param[4 - 1 + paramOffset]; + h21 = param[3 - 1 + paramOffset]; + index = 1; + for (int i = n; i > 0; --i) { + wi = x[xIndex - 1 + xOffset]; + zi = y[yIndex - 1 + yOffset]; + x[xIndex - 1 + xOffset] = wi + zi * h12; + y[yIndex - 1 + yOffset] = wi * h21 + zi; + xIndex += incx; + yIndex += incy; + ++index; + } + } else { + h11 = param[2 - 1 + 
paramOffset]; + h22 = param[5 - 1 + paramOffset]; + for (int i = n; i > 0; --i) { + wi = x[xIndex - 1 + xOffset]; + zi = y[yIndex - 1 + yOffset]; + x[xIndex - 1 + xOffset] = wi * h11 + zi; + y[yIndex - 1 + yOffset] = -wi + h22 * zi; + xIndex += incx; + yIndex += incy; + } + } + } else { + long nSteps = (long) n * incx; + if (flag < 0.0) { + h11 = param[2 - 1 + paramOffset]; + h12 = param[4 - 1 + paramOffset]; + h21 = param[3 - 1 + paramOffset]; + h22 = param[5 - 1 + paramOffset]; + index = 1; + for (long i = (nSteps - 1 + incx) / incx; i > 0; --i) { + wi = x[index - 1 + xOffset]; + zi = y[index - 1 + yOffset]; + x[index - 1 + xOffset] = wi * h11 + zi * h12; + y[index - 1 + yOffset] = wi * h21 + zi * h22; + index += incx; + } + } else if (flag == 0.0) { + h12 = param[4 - 1 + paramOffset]; + h21 = param[3 - 1 + paramOffset]; + index = 1; + for (long i = (nSteps - 1 + incx) / incx; i > 0; --i) { + wi = x[index - 1 + xOffset]; + zi = y[index - 1 + yOffset]; + x[index - 1 + xOffset] = wi + zi * h12; + y[index - 1 + yOffset] = wi * h21 + zi; + index += incx; + } + } else { + h11 = param[2 - 1 + paramOffset]; + h22 = param[5 - 1 + paramOffset]; + index = 1; + for (long i = (nSteps - 1 + incx) / incx; i > 0; --i) { + wi = x[index - 1 + xOffset]; + zi = y[index - 1 + yOffset]; + x[index - 1 + xOffset] = wi * h11 + zi; + y[index - 1 + yOffset] = -wi + h22 * zi; + index += incx; + } + } + } + } + } + + @Override + public void drotm(int n, double[] x, int incx, double[] y, int incy, double[] param) { + drotm(n, x, 0, incx, y, 0, incy, param, 0); + } + + /** + * F2jblas drotm fixed version, use long to store (n * incx) to avoid int overflow. 
+ */ + @Override + public void drotm(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy, double[] param, + int paramOffset) { + double flag = 0.0D; + double h11 = 0.0D; + double h12 = 0.0D; + double h21 = 0.0D; + double h22 = 0.0D; + double wi = 0.0D; + double zi = 0.0D; + flag = param[paramOffset]; + if (n > 0 && Double.compare(flag, -2.0D) != 0) { // If flag equals -2.0, do nothing and return directly. + int index; + if ((incx == incy && incx > 0) ^ true) { + int xIndex = 1; + int yIndex = 1; + if (incx < 0) { + xIndex = 1 + (1 - n) * incx; + } + if (incy < 0) { + yIndex = 1 + (1 - n) * incy; + } + if (flag < 0.0) { + h11 = param[2 - 1 + paramOffset]; + h12 = param[4 - 1 + paramOffset]; + h21 = param[3 - 1 + paramOffset]; + h22 = param[5 - 1 + paramOffset]; + index = 1; + for (int i = n; i > 0; --i) { + wi = x[xIndex - 1 + xOffset]; + zi = y[yIndex - 1 + yOffset]; + x[xIndex - 1 + xOffset] = wi * h11 + zi * h12; + y[yIndex - 1 + yOffset] = wi * h21 + zi * h22; + xIndex += incx; + yIndex += incy; + ++index; + } + } else if (flag == 0.0) { + h12 = param[4 - 1 + paramOffset]; + h21 = param[3 - 1 + paramOffset]; + index = 1; + for (int i = n; i > 0; --i) { + wi = x[xIndex - 1 + xOffset]; + zi = y[yIndex - 1 + yOffset]; + x[xIndex - 1 + xOffset] = wi + zi * h12; + y[yIndex - 1 + yOffset] = wi * h21 + zi; + xIndex += incx; + yIndex += incy; + ++index; + } + } else { + h11 = param[2 - 1 + paramOffset]; + h22 = param[5 - 1 + paramOffset]; + for (int i = n; i > 0; --i) { + wi = x[xIndex - 1 + xOffset]; + zi = y[yIndex - 1 + yOffset]; + x[xIndex - 1 + xOffset] = wi * h11 + zi; + y[yIndex - 1 + yOffset] = -wi + h22 * zi; + xIndex += incx; + yIndex += incy; + } + } + } else { + long nSteps = (long) n * incx; + if (flag < 0.0) { + h11 = param[2 - 1 + paramOffset]; + h12 = param[4 - 1 + paramOffset]; + h21 = param[3 - 1 + paramOffset]; + h22 = param[5 - 1 + paramOffset]; + index = 1; + for (long i = (nSteps - 1 + incx) / incx; i > 0; --i) { + wi = 
x[index - 1 + xOffset]; + zi = y[index - 1 + yOffset]; + x[index - 1 + xOffset] = wi * h11 + zi * h12; + y[index - 1 + yOffset] = wi * h21 + zi * h22; + index += incx; + } + } else if (flag == 0.0) { + h12 = param[4 - 1 + paramOffset]; + h21 = param[3 - 1 + paramOffset]; + index = 1; + for (long i = (nSteps - 1 + incx) / incx; i > 0; --i) { + wi = x[index - 1 + xOffset]; + zi = y[index - 1 + yOffset]; + x[index - 1 + xOffset] = wi + zi * h12; + y[index - 1 + yOffset] = wi * h21 + zi; + index += incx; + } + } else { + h11 = param[2 - 1 + paramOffset]; + h22 = param[5 - 1 + paramOffset]; + index = 1; + for (long i = (nSteps - 1 + incx) / incx; i > 0; --i) { + wi = x[index - 1 + xOffset]; + zi = y[index - 1 + yOffset]; + x[index - 1 + xOffset] = wi * h11 + zi; + y[index - 1 + yOffset] = -wi + h22 * zi; + index += incx; + } + } + } + } + } + + @Override + public void sscal(int n, float alp, float[] x, int incx) { + org.netlib.blas.Sscal.sscal(n, alp, x, 0, incx); + } + + @Override + public void sscal(int n, float alp, float[] x, int xOffset, int incx) { + org.netlib.blas.Sscal.sscal(n, alp, x, xOffset, incx); + } + + @Override + public void dscal(int n, double alp, double[] x, int incx) { + org.netlib.blas.Dscal.dscal(n, alp, x, 0, incx); + } + + @Override + public void dscal(int n, double alp, double[] x, int xOffset, int incx) { + org.netlib.blas.Dscal.dscal(n, alp, x, xOffset, incx); + } + + @Override + public void sswap(int n, float[] x, int incx, float[] y, int incy) { + org.netlib.blas.Sswap.sswap(n, x, 0, incx, y, 0, incy); + } + + @Override + public void sswap(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) { + org.netlib.blas.Sswap.sswap(n, x, xOffset, incx, y, yOffset, incy); + } + + @Override + public void dswap(int n, double[] x, int incx, double[] y, int incy) { + org.netlib.blas.Dswap.dswap(n, x, 0, incx, y, 0, incy); + } + + @Override + public void dswap(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int 
incy) { + org.netlib.blas.Dswap.dswap(n, x, xOffset, incx, y, yOffset, incy); + } + + @Override + public int isamax(int n, float[] x, int incx) { + return org.netlib.blas.Isamax.isamax(n, x, 0, incx); + } + + @Override + public int isamax(int n, float[] x, int xOffset, int incx) { + return org.netlib.blas.Isamax.isamax(n, x, xOffset, incx); + } + + @Override + public int idamax(int n, double[] x, int incx) { + return org.netlib.blas.Idamax.idamax(n, x, 0, incx); + } + + @Override + public int idamax(int n, double[] x, int xOffset, int incx) { + return org.netlib.blas.Idamax.idamax(n, x, xOffset, incx); + } + + @Override + public void dgbmv(String trans, int m, int n, int kl, int ku, double alpha, double[] a, int lda, + double[] x, int incx, double beta, double[] y, int incy) { + org.netlib.blas.Dgbmv.dgbmv(trans, m, n, kl, ku, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy); + } + + @Override + public void dgbmv(String trans, int m, int n, int kl, int ku, double alpha, double[] a, int aOffset, int lda, + double[] x, int xOffset, int incx, double beta, double[] y, int yOffset, int incy) { + org.netlib.blas.Dgbmv.dgbmv( + trans, m, n, kl, ku, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy); + } + + @Override + public void sgbmv(String trans, int m, int n, int kl, int ku, float alpha, float[] a, int lda, float[] x, int incx, + float beta, float[] y, int incy) { + org.netlib.blas.Sgbmv.sgbmv(trans, m, n, kl, ku, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy); + } + + @Override + public void sgbmv(String trans, int m, int n, int kl, int ku, float alpha, float[] a, int aOffset, int lda, + float[] x, int xOffset, int incx, float beta, float[] y, int yOffset, int incy) { + org.netlib.blas.Sgbmv.sgbmv( + trans, m, n, kl, ku, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy); + } + + @Override + public void dgemv(String trans, int m, int n, double alpha, double[] a, int lda, double[] x, int incx, double beta, + double[] y, int incy) { + 
org.netlib.blas.Dgemv.dgemv(trans, m, n, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy); + } + + @Override + public void dgemv(String trans, int m, int n, double alpha, double[] a, int aOffset, int lda, double[] x, + int xOffset, int incx, double beta, double[] y, int yOffset, int incy) { + org.netlib.blas.Dgemv.dgemv(trans, m, n, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy); + } + + @Override + public void sgemv(String trans, int m, int n, float alpha, float[] a, int lda, float[] x, int incx, float beta, + float[] y, int incy) { + org.netlib.blas.Sgemv.sgemv(trans, m, n, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy); + } + + @Override + public void sgemv(String trans, int m, int n, float alpha, float[] a, int aOffset, int lda, float[] x, + int xOffset, int incx, float beta, float[] y, int yOffset, int incy) { + org.netlib.blas.Sgemv.sgemv(trans, m, n, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy); + } + + @Override + public void dger(int m, int n, double alpha, double[] x, int incx, double[] y, int incy, double[] a, int lda) { + org.netlib.blas.Dger.dger(m, n, alpha, x, 0, incx, y, 0, incy, a, 0, lda); + } + + @Override + public void dger(int m, int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy, + double[] a, int aOffset, int lda) { + org.netlib.blas.Dger.dger(m, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset, lda); + } + + @Override + public void sger(int m, int n, float alpha, float[] x, int incx, float[] y, int incy, float[] a, int lda) { + org.netlib.blas.Sger.sger(m, n, alpha, x, 0, incx, y, 0, incy, a, 0, lda); + } + + @Override + public void sger(int m, int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, + float[] a, int aOffset, int lda) { + org.netlib.blas.Sger.sger(m, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset, lda); + } + + @Override + public void dsbmv(String uplo, int n, int k, double alpha, double[] a, int 
lda, double[] x, int incx, double beta, + double[] y, int incy) { + org.netlib.blas.Dsbmv.dsbmv(uplo, n, k, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy); + } + + @Override + public void dsbmv(String uplo, int n, int k, double alpha, double[] a, int aOffset, int lda, + double[] x, int xOffset, int incx, double beta, double[] y, int yOffset, int incy) { + org.netlib.blas.Dsbmv.dsbmv(uplo, n, k, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy); + } + + @Override + public void ssbmv(String uplo, int n, int k, float alpha, float[] a, int lda, float[] x, int incx, float beta, + float[] y, int incy) { + org.netlib.blas.Ssbmv.ssbmv(uplo, n, k, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy); + } + + @Override + public void ssbmv(String uplo, int n, int k, float alpha, float[] a, int aOffset, int lda, float[] x, + int xOffset, int incx, float beta, float[] y, int yOffset, int incy) { + org.netlib.blas.Ssbmv.ssbmv(uplo, n, k, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy); + } + + @Override + public void dspmv(String uplo, int n, double alpha, double[] a, double[] x, int incx, double beta, + double[] y, int incy) { + org.netlib.blas.Dspmv.dspmv(uplo, n, alpha, a, 0, x, 0, incx, beta, y, 0, incy); + } + + @Override + public void dspmv(String uplo, int n, double alpha, double[] a, int aOffset, double[] x, int xOffset, int incx, + double beta, double[] y, int yOffset, int incy) { + org.netlib.blas.Dspmv.dspmv(uplo, n, alpha, a, aOffset, x, xOffset, incx, beta, y, yOffset, incy); + } + + @Override + public void sspmv(String uplo, int n, float alpha, float[] a, float[] x, int incx, float beta, + float[] y, int incy) { + org.netlib.blas.Sspmv.sspmv(uplo, n, alpha, a, 0, x, 0, incx, beta, y, 0, incy); + } + + @Override + public void sspmv(String uplo, int n, float alpha, float[] a, int aOffset, float[] x, int xOffset, int incx, + float beta, float[] y, int yOffset, int incy) { + org.netlib.blas.Sspmv.sspmv(uplo, n, alpha, a, aOffset, x, 
xOffset, incx, beta, y, yOffset, incy); + } + + @Override + public void dspr(String uplo, int n, double alpha, double[] x, int incx, double[] ap) { + org.netlib.blas.Dspr.dspr(uplo, n, alpha, x, 0, incx, ap, 0); + } + + @Override + public void dspr(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] ap, int aOffset) { + org.netlib.blas.Dspr.dspr(uplo, n, alpha, x, xOffset, incx, ap, aOffset); + } + + @Override + public void sspr(String uplo, int n, float alpha, float[] x, int incx, float[] ap) { + org.netlib.blas.Sspr.sspr(uplo, n, alpha, x, 0, incx, ap, 0); + } + + @Override + public void sspr(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] ap, int aOffset) { + org.netlib.blas.Sspr.sspr(uplo, n, alpha, x, xOffset, incx, ap, aOffset); + } + + @Override + public void dspr2(String uplo, int n, double alpha, double[] x, int incx, double[] y, int incy, double[] a) { + org.netlib.blas.Dspr2.dspr2(uplo, n, alpha, x, 0, incx, y, 0, incy, a, 0); + } + + @Override + public void dspr2(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset, + int incy, double[] a, int aOffset) { + org.netlib.blas.Dspr2.dspr2(uplo, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset); + } + + @Override + public void sspr2(String uplo, int n, float alpha, float[] x, int incx, float[] y, int incy, float[] a) { + org.netlib.blas.Sspr2.sspr2(uplo, n, alpha, x, 0, incx, y, 0, incy, a, 0); + } + + @Override + public void sspr2(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset, + int incy, float[] a, int aOffset) { + org.netlib.blas.Sspr2.sspr2(uplo, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset); + } + + @Override + public void dsymv(String uplo, int n, double alpha, double[] a, int lda, double[] x, int incx, double beta, + double[] y, int incy) { + org.netlib.blas.Dsymv.dsymv(uplo, n, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy); + } + + @Override + 
public void dsymv(String uplo, int n, double alpha, double[] a, int aOffset, int lda, double[] x, int xOffset, + int incx, double beta, double[] y, int yOffset, int incy) { + org.netlib.blas.Dsymv.dsymv(uplo, n, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy); + } + + @Override + public void ssymv(String uplo, int n, float alpha, float[] a, int lda, float[] x, int incx, float beta, + float[] y, int incy) { + org.netlib.blas.Ssymv.ssymv(uplo, n, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy); + } + + @Override + public void ssymv(String uplo, int n, float alpha, float[] a, int aOffset, int lda, float[] x, int xOffset, + int incx, float beta, float[] y, int yOffset, int incy) { + org.netlib.blas.Ssymv.ssymv(uplo, n, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy); + } + + @Override + public void dsyr(String uplo, int n, double alpha, double[] x, int incx, double[] a, int lda) { + org.netlib.blas.Dsyr.dsyr(uplo, n, alpha, x, 0, incx, a, 0, lda); + } + + @Override + public void dsyr(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] a, int aOffset, + int lda) { + org.netlib.blas.Dsyr.dsyr(uplo, n, alpha, x, xOffset, incx, a, aOffset, lda); + } + + @Override + public void ssyr(String uplo, int n, float alpha, float[] x, int incx, float[] a, int lda) { + org.netlib.blas.Ssyr.ssyr(uplo, n, alpha, x, 0, incx, a, 0, lda); + } + + @Override + public void ssyr(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] a, int aOffset, + int lda) { + org.netlib.blas.Ssyr.ssyr(uplo, n, alpha, x, xOffset, incx, a, aOffset, lda); + } + + @Override + public void dsyr2(String uplo, int n, double alpha, double[] x, int incx, double[] y, int incy, + double[] a, int lda) { + org.netlib.blas.Dsyr2.dsyr2(uplo, n, alpha, x, 0, incx, y, 0, incy, a, 0, lda); + } + + @Override + public void dsyr2(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset, + int incy, double[] a, 
int aOffset, int lda) { + org.netlib.blas.Dsyr2.dsyr2(uplo, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset, lda); + } + + @Override + public void ssyr2(String uplo, int n, float alpha, float[] x, int incx, float[] y, int incy, float[] a, int lda) { + org.netlib.blas.Ssyr2.ssyr2(uplo, n, alpha, x, 0, incx, y, 0, incy, a, 0, lda); + } + + @Override + public void ssyr2(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset, + int incy, float[] a, int aOffset, int lda) { + org.netlib.blas.Ssyr2.ssyr2(uplo, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset, lda); + } + + @Override + public void dtbmv(String uplo, String trans, String diag, int n, int k, double[] a, int lda, double[] x, int incx) { + org.netlib.blas.Dtbmv.dtbmv(uplo, trans, diag, n, k, a, 0, lda, x, 0, incx); + } + + @Override + public void dtbmv(String uplo, String trans, String diag, int n, int k, double[] a, int aOffset, int lda, + double[] x, int xOffset, int incx) { + org.netlib.blas.Dtbmv.dtbmv(uplo, trans, diag, n, k, a, aOffset, lda, x, xOffset, incx); + } + + @Override + public void stbmv(String uplo, String trans, String diag, int n, int k, float[] a, int lda, float[] x, int incx) { + org.netlib.blas.Stbmv.stbmv(uplo, trans, diag, n, k, a, 0, lda, x, 0, incx); + } + + @Override + public void stbmv(String uplo, String trans, String diag, int n, int k, float[] a, int aOffset, int lda, + float[] x, int xOffset, int incx) { + org.netlib.blas.Stbmv.stbmv(uplo, trans, diag, n, k, a, aOffset, lda, x, xOffset, incx); + } + + @Override + public void dtbsv(String uplo, String trans, String diag, int n, int k, double[] a, int lda, double[] x, int incx) { + org.netlib.blas.Dtbsv.dtbsv(uplo, trans, diag, n, k, a, 0, lda, x, 0, incx); + } + + @Override + public void dtbsv(String uplo, String trans, String diag, int n, int k, double[] a, int aOffset, int lda, + double[] x, int xOffset, int incx) { + org.netlib.blas.Dtbsv.dtbsv(uplo, trans, diag, n, k, a, 
aOffset, lda, x, xOffset, incx); + } + + @Override + public void stbsv(String uplo, String trans, String diag, int n, int k, float[] a, int lda, float[] x, int incx) { + org.netlib.blas.Stbsv.stbsv(uplo, trans, diag, n, k, a, 0, lda, x, 0, incx); + } + + @Override + public void stbsv(String uplo, String trans, String diag, int n, int k, float[] a, int aOffset, int lda, + float[] x, int xOffset, int incx) { + org.netlib.blas.Stbsv.stbsv(uplo, trans, diag, n, k, a, aOffset, lda, x, xOffset, incx); + } + + @Override + public void dtpmv(String uplo, String transa, String diag, int n, double[] a, double[] x, int incx) { + org.netlib.blas.Dtpmv.dtpmv(uplo, transa, diag, n, a, 0, x, 0, incx); + } + + @Override + public void dtpmv(String uplo, String transa, String diag, int n, double[] a, int aOffset, double[] x, + int xOffset, int incx) { + org.netlib.blas.Dtpmv.dtpmv(uplo, transa, diag, n, a, aOffset, x, xOffset, incx); + } + + @Override + public void stpmv(String uplo, String transa, String diag, int n, float[] a, float[] x, int incx) { + org.netlib.blas.Stpmv.stpmv(uplo, transa, diag, n, a, 0, x, 0, incx); + } + + @Override + public void stpmv(String uplo, String transa, String diag, int n, float[] a, int aOffset, float[] x, + int xOffset, int incx) { + org.netlib.blas.Stpmv.stpmv(uplo, transa, diag, n, a, aOffset, x, xOffset, incx); + } + + @Override + public void dtpsv(String uplo, String transa, String diag, int n, double[] a, double[] x, int incx) { + org.netlib.blas.Dtpsv.dtpsv(uplo, transa, diag, n, a, 0, x, 0, incx); + } + + @Override + public void dtpsv(String uplo, String transa, String diag, int n, double[] a, int aOffset, double[] x, + int xOffset, int incx) { + org.netlib.blas.Dtpsv.dtpsv(uplo, transa, diag, n, a, aOffset, x, xOffset, incx); + } + + @Override + public void stpsv(String uplo, String transa, String diag, int n, float[] a, float[] x, int incx) { + org.netlib.blas.Stpsv.stpsv(uplo, transa, diag, n, a, 0, x, 0, incx); + } + + @Override + public 
void stpsv(String uplo, String transa, String diag, int n, float[] a, int aOffset, float[] x, + int xOffset, int incx) { + org.netlib.blas.Stpsv.stpsv(uplo, transa, diag, n, a, aOffset, x, xOffset, incx); + } + + @Override + public void dtrmv(String uplo, String trans, String diag, int n, double[] a, int lda, double[] x, int incx) { + org.netlib.blas.Dtrmv.dtrmv(uplo, trans, diag, n, a, 0, lda, x, 0, incx); + } + + @Override + public void dtrmv(String uplo, String trans, String diag, int n, double[] a, int aOffset, int lda, + double[] x, int xOffset, int incx) { + org.netlib.blas.Dtrmv.dtrmv(uplo, trans, diag, n, a, aOffset, lda, x, xOffset, incx); + } + + @Override + public void strmv(String uplo, String trans, String diag, int n, float[] a, int lda, float[] x, int incx) { + org.netlib.blas.Strmv.strmv(uplo, trans, diag, n, a, 0, lda, x, 0, incx); + } + + @Override + public void strmv(String uplo, String trans, String diag, int n, float[] a, int aOffset, int lda, + float[] x, int xOffset, int incx) { + org.netlib.blas.Strmv.strmv(uplo, trans, diag, n, a, aOffset, lda, x, xOffset, incx); + } + + @Override + public void dtrsv(String uplo, String transa, String diag, int n, double[] a, int lda, double[] x, int incx) { + org.netlib.blas.Dtrsv.dtrsv(uplo, transa, diag, n, a, 0, lda, x, 0, incx); + } + + @Override + public void dtrsv(String uplo, String transa, String diag, int n, double[] a, int aOffset, int lda, + double[] x, int xOffset, int incx) { + org.netlib.blas.Dtrsv.dtrsv(uplo, transa, diag, n, a, aOffset, lda, x, xOffset, incx); + } + + @Override + public void strsv(String uplo, String transa, String diag, int n, float[] a, int lda, float[] x, int incx) { + org.netlib.blas.Strsv.strsv(uplo, transa, diag, n, a, 0, lda, x, 0, incx); + } + + @Override + public void strsv(String uplo, String transa, String diag, int n, float[] a, int aOffset, int lda, + float[] x, int xOffset, int incx) { + org.netlib.blas.Strsv.strsv(uplo, transa, diag, n, a, aOffset, lda, x, 
xOffset, incx); + } + + @Override + public void dgemm(String transa, String transb, int m, int n, int k, double alpha, double[] a, int lda, + double[] b, int ldb, double beta, double[] c, int ldc) { + org.netlib.blas.Dgemm.dgemm(transa, transb, m, n, k, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc); + } + + @Override + public void dgemm(String transa, String transb, int m, int n, int k, double alpha, double[] a, int aOffset, + int lda, double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc) { + org.netlib.blas.Dgemm.dgemm( + transa, transb, m, n, k, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc); + } + + @Override + public void sgemm(String transa, String transb, int m, int n, int k, float alpha, float[] a, int lda, + float[] b, int ldb, float beta, float[] c, int ldc) { + org.netlib.blas.Sgemm.sgemm(transa, transb, m, n, k, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc); + } + + @Override + public void sgemm(String transa, String transb, int m, int n, int k, float alpha, float[] a, int aOffset, + int lda, float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc) { + org.netlib.blas.Sgemm.sgemm( + transa, transb, m, n, k, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc); + } + + @Override + public void dsymm(String side, String uplo, int m, int n, double alpha, double[] a, int lda, + double[] b, int ldb, double beta, double[] c, int ldc) { + org.netlib.blas.Dsymm.dsymm(side, uplo, m, n, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc); + } + + @Override + public void dsymm(String side, String uplo, int m, int n, double alpha, double[] a, int aOffset, int lda, + double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc) { + org.netlib.blas.Dsymm.dsymm(side, uplo, m, n, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc); + } + + @Override + public void ssymm(String side, String uplo, int m, int n, float alpha, float[] a, int lda, + float[] b, int ldb, float 
beta, float[] c, int ldc) { + org.netlib.blas.Ssymm.ssymm(side, uplo, m, n, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc); + } + + @Override + public void ssymm(String side, String uplo, int m, int n, float alpha, float[] a, int aOffset, int lda, + float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc) { + org.netlib.blas.Ssymm.ssymm(side, uplo, m, n, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc); + } + + @Override + public void dsyr2k(String uplo, String trans, int n, int k, double alpha, double[] a, int lda, + double[] b, int ldb, double beta, double[] c, int ldc) { + org.netlib.blas.Dsyr2k.dsyr2k(uplo, trans, n, k, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc); + } + + @Override + public void dsyr2k(String uplo, String trans, int n, int k, double alpha, double[] a, int aOffset, int lda, + double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc) { + org.netlib.blas.Dsyr2k.dsyr2k( + uplo, trans, n, k, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc); + } + + @Override + public void ssyr2k(String uplo, String trans, int n, int k, float alpha, float[] a, int lda, + float[] b, int ldb, float beta, float[] c, int ldc) { + org.netlib.blas.Ssyr2k.ssyr2k(uplo, trans, n, k, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc); + } + + @Override + public void ssyr2k(String uplo, String trans, int n, int k, float alpha, float[] a, int aOffset, int lda, + float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc) { + org.netlib.blas.Ssyr2k.ssyr2k( + uplo, trans, n, k, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc); + } + + @Override + public void dsyrk(String uplo, String trans, int n, int k, double alpha, double[] a, int lda, double beta, + double[] c, int ldc) { + org.netlib.blas.Dsyrk.dsyrk(uplo, trans, n, k, alpha, a, 0, lda, beta, c, 0, ldc); + } + + @Override + public void dsyrk(String uplo, String trans, int n, int k, double alpha, double[] a, int aOffset, int 
lda, + double beta, double[] c, int cOffset, int ldc) { + org.netlib.blas.Dsyrk.dsyrk(uplo, trans, n, k, alpha, a, aOffset, lda, beta, c, cOffset, ldc); + } + + @Override + public void ssyrk(String uplo, String trans, int n, int k, float alpha, float[] a, int lda, float beta, + float[] c, int ldc) { + org.netlib.blas.Ssyrk.ssyrk(uplo, trans, n, k, alpha, a, 0, lda, beta, c, 0, ldc); + } + + @Override + public void ssyrk(String uplo, String trans, int n, int k, float alpha, float[] a, int aOffset, int lda, + float beta, float[] c, int cOffset, int ldc) { + org.netlib.blas.Ssyrk.ssyrk(uplo, trans, n, k, alpha, a, aOffset, lda, beta, c, cOffset, ldc); + } + + @Override + public void dtrmm(String side, String uplo, String transa, String diag, int m, int n, double alpha, + double[] a, int lda, double[] b, int ldb) { + org.netlib.blas.Dtrmm.dtrmm(side, uplo, transa, diag, m, n, alpha, a, 0, lda, b, 0, ldb); + } + + @Override + public void dtrmm(String side, String uplo, String transa, String diag, int m, int n, double alpha, double[] a, + int aOffset, int lda, double[] b, int bOffset, int ldb) { + org.netlib.blas.Dtrmm.dtrmm(side, uplo, transa, diag, m, n, alpha, a, aOffset, lda, b, bOffset, ldb); + } + + @Override + public void strmm(String side, String uplo, String transa, String diag, int m, int n, float alpha, + float[] a, int lda, float[] b, int ldb) { + org.netlib.blas.Strmm.strmm(side, uplo, transa, diag, m, n, alpha, a, 0, lda, b, 0, ldb); + } + + @Override + public void strmm(String side, String uplo, String transa, String diag, int m, int n, float alpha, float[] a, + int aOffset, int lda, float[] b, int bOffset, int ldb) { + org.netlib.blas.Strmm.strmm(side, uplo, transa, diag, m, n, alpha, a, aOffset, lda, b, bOffset, ldb); + } + + @Override + public void dtrsm(String side, String uplo, String transa, String diag, int m, int n, double alpha, + double[] a, int lda, double[] b, int ldb) { + org.netlib.blas.Dtrsm.dtrsm(side, uplo, transa, diag, m, n, alpha, a, 0, 
lda, b, 0, ldb); + } + + @Override + public void dtrsm(String side, String uplo, String transa, String diag, int m, int n, double alpha, double[] a, + int aOffset, int lda, double[] b, int bOffset, int ldb) { + org.netlib.blas.Dtrsm.dtrsm(side, uplo, transa, diag, m, n, alpha, a, aOffset, lda, b, bOffset, ldb); + } + + @Override + public void strsm(String side, String uplo, String transa, String diag, int m, int n, float alpha, + float[] a, int lda, float[] b, int ldb) { + org.netlib.blas.Strsm.strsm(side, uplo, transa, diag, m, n, alpha, a, 0, lda, b, 0, ldb); + } + + @Override + public void strsm(String side, String uplo, String transa, String diag, int m, int n, float alpha, float[] a, + int aOffset, int lda, float[] b, int bOffset, int ldb) { + org.netlib.blas.Strsm.strsm(side, uplo, transa, diag, m, n, alpha, a, aOffset, lda, b, bOffset, ldb); + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/VectorBLAS.java b/vectorBlas/src/main/java/com/huawei/vectorblas/VectorBLAS.java new file mode 100644 index 0000000000000000000000000000000000000000..6b82957276919d60be9ff0f87b9de07696d05010 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/VectorBLAS.java @@ -0,0 +1,424 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas; + +import com.huawei.vectorblas.blas1.doubleprecision.Dasum; +import com.huawei.vectorblas.blas1.doubleprecision.Daxpy; +import com.huawei.vectorblas.blas1.doubleprecision.Dcopy; +import com.huawei.vectorblas.blas1.doubleprecision.Ddot; +import com.huawei.vectorblas.blas1.doubleprecision.Dnrm2; +import com.huawei.vectorblas.blas1.doubleprecision.Drot; +import com.huawei.vectorblas.blas1.doubleprecision.Drotm; +import com.huawei.vectorblas.blas1.doubleprecision.Dscal; +import com.huawei.vectorblas.blas1.doubleprecision.Dswap; +import com.huawei.vectorblas.blas1.doubleprecision.Idamax; +import com.huawei.vectorblas.blas1.singleprecision.Isamax; +import com.huawei.vectorblas.blas1.singleprecision.Sasum; +import com.huawei.vectorblas.blas1.singleprecision.Saxpy; +import com.huawei.vectorblas.blas1.singleprecision.Scopy; +import com.huawei.vectorblas.blas1.singleprecision.Sdot; +import com.huawei.vectorblas.blas1.singleprecision.Snrm2; +import com.huawei.vectorblas.blas1.singleprecision.Srot; +import com.huawei.vectorblas.blas1.singleprecision.Srotm; +import com.huawei.vectorblas.blas1.singleprecision.Sscal; +import com.huawei.vectorblas.blas1.singleprecision.Sswap; +import com.huawei.vectorblas.blas2.doubleprecision.Dgemv; +import com.huawei.vectorblas.blas2.doubleprecision.Dger; +import com.huawei.vectorblas.blas2.doubleprecision.Dspmv; +import com.huawei.vectorblas.blas2.doubleprecision.Dspr; +import com.huawei.vectorblas.blas2.doubleprecision.Dsymv; +import com.huawei.vectorblas.blas2.singleprecision.Sgemv; +import 
com.huawei.vectorblas.blas2.singleprecision.Sger; +import com.huawei.vectorblas.blas2.singleprecision.Sspmv; +import com.huawei.vectorblas.blas2.singleprecision.Sspr; +import com.huawei.vectorblas.blas2.singleprecision.Ssymv; +import com.huawei.vectorblas.blas3.doubleprecision.Dgemm; +import com.huawei.vectorblas.blas3.doubleprecision.Dsymm; +import com.huawei.vectorblas.blas3.singleprecision.Sgemm; +import com.huawei.vectorblas.blas3.singleprecision.Ssymm; + +public class VectorBLAS extends F2jBLAS { + @Override + public double dasum(int n, double[] x, int incx) { + return Dasum.dasum(n, x, 0, incx); + } + + @Override + public double dasum(int n, double[] x, int xOffset, int incx) { + return Dasum.dasum(n, x, xOffset, incx); + } + + @Override + public float sasum(int n, float[] x, int incx) { + return Sasum.sasum(n, x, 0, incx); + } + + @Override + public float sasum(int n, float[] x, int xOffset, int incx) { + return Sasum.sasum(n, x, xOffset, incx); + } + + @Override + public void daxpy(int n, double alpha, double[] x, int incx, double[] y, int incy) { + Daxpy.daxpy(n, alpha, x, 0, incx, y, 0, incy); + } + + @Override + public void daxpy(int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) { + Daxpy.daxpy(n, alpha, x, xOffset, incx, y, yOffset, incy); + } + + @Override + public void saxpy(int n, float alpha, float[] x, int incx, float[] y, int incy) { + Saxpy.saxpy(n, alpha, x, 0, incx, y, 0, incy); + } + + @Override + public void saxpy(int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) { + Saxpy.saxpy(n, alpha, x, xOffset, incx, y, yOffset, incy); + } + + @Override + public void dcopy(int n, double[] x, int incx, double[] y, int incy) { + Dcopy.dcopy(n, x, 0, incx, y, 0, incy); + } + + @Override + public void dcopy(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) { + Dcopy.dcopy(n, x, xOffset, incx, y, yOffset, incy); + } + + @Override + public void scopy(int 
n, float[] x, int incx, float[] y, int incy) {
        Scopy.scopy(n, x, 0, incx, y, 0, incy);
    }

    // Forwards every argument unchanged to Scopy.scopy.
    @Override
    public void scopy(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
        Scopy.scopy(n, x, xOffset, incx, y, yOffset, incy);
    }

    // Zero-offset overload: calls Ddot.ddot with xOffset = yOffset = 0 and returns its result.
    @Override
    public double ddot(int n, double[] x, int incx, double[] y, int incy) {
        return Ddot.ddot(n, x, 0, incx, y, 0, incy);
    }

    // Forwards every argument unchanged to Ddot.ddot and returns its result.
    @Override
    public double ddot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) {
        return Ddot.ddot(n, x, xOffset, incx, y, yOffset, incy);
    }

    // Zero-offset overload: calls Sdot.sdot with xOffset = yOffset = 0 and returns its result.
    @Override
    public float sdot(int n, float[] x, int incx, float[] y, int incy) {
        return Sdot.sdot(n, x, 0, incx, y, 0, incy);
    }

    // Forwards every argument unchanged to Sdot.sdot and returns its result.
    @Override
    public float sdot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
        return Sdot.sdot(n, x, xOffset, incx, y, yOffset, incy);
    }

    // Zero-offset overload: calls Snrm2.snrm2 with xOffset = 0 and returns its result.
    @Override
    public float snrm2(int n, float[] x, int incx) {
        return Snrm2.snrm2(n, x, 0, incx);
    }

    // Forwards every argument unchanged to Snrm2.snrm2 and returns its result.
    @Override
    public float snrm2(int n, float[] x, int xOffset, int incx) {
        return Snrm2.snrm2(n, x, xOffset, incx);
    }

    // Zero-offset overload: calls Dnrm2.dnrm2 with xOffset = 0 and returns its result.
    @Override
    public double dnrm2(int n, double[] x, int incx) {
        return Dnrm2.dnrm2(n, x, 0, incx);
    }

    // Forwards every argument unchanged to Dnrm2.dnrm2 and returns its result.
    @Override
    public double dnrm2(int n, double[] x, int xOffset, int incx) {
        return Dnrm2.dnrm2(n, x, xOffset, incx);
    }

    // Zero-offset overload: calls Srot.srot with xOffset = yOffset = 0.
    @Override
    public void srot(int n, float[] x, int incx, float[] y, int incy, float c, float s) {
        Srot.srot(n, x, 0, incx, y, 0, incy, c, s);
    }

    // Forwards every argument unchanged to Srot.srot.
    @Override
    public void srot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, float c, float s) {
        Srot.srot(n, x, xOffset, incx, y, yOffset, incy, c, s);
    }

    // Zero-offset overload: calls Drot.drot with xOffset = yOffset = 0.
    @Override
    public void drot(int n, double[] x, int incx, double[] y, int incy, double c, double s) {
        Drot.drot(n, x, 0, incx, y, 0, incy, c, s);
    }

    // Forwards every argument unchanged to Drot.drot.
    @Override
    public void drot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy, double c, double s) {
        Drot.drot(n, x, xOffset, incx, y, yOffset, incy, c, s);
    }

    // Zero-offset overload: calls Srotm.srotm with xOffset = yOffset = paramOffset = 0.
    @Override
    public void srotm(int n, float[] x, int incx, float[] y, int incy, float[] param) {
        Srotm.srotm(n, x, 0, incx, y, 0, incy, param, 0);
    }

    // Forwards every argument unchanged to Srotm.srotm.
    @Override
    public void srotm(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, float[] param,
            int paramOffset) {
        Srotm.srotm(n, x, xOffset, incx, y, yOffset, incy, param, paramOffset);
    }

    // Zero-offset overload: calls Drotm.drotm with xOffset = yOffset = paramOffset = 0.
    @Override
    public void drotm(int n, double[] x, int incx, double[] y, int incy, double[] param) {
        Drotm.drotm(n, x, 0, incx, y, 0, incy, param, 0);
    }

    // Forwards every argument unchanged to Drotm.drotm.
    @Override
    public void drotm(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy, double[] param,
            int paramOffset) {
        Drotm.drotm(n, x, xOffset, incx, y, yOffset, incy, param, paramOffset);
    }

    // Zero-offset overload: calls Sscal.sscal with xOffset = 0.
    @Override
    public void sscal(int n, float alp, float[] x, int incx) {
        Sscal.sscal(n, alp, x, 0, incx);
    }

    // Forwards every argument unchanged to Sscal.sscal.
    @Override
    public void sscal(int n, float alp, float[] x, int xOffset, int incx) {
        Sscal.sscal(n, alp, x, xOffset, incx);
    }

    // Zero-offset overload: calls Dscal.dscal with xOffset = 0.
    @Override
    public void dscal(int n, double alp, double[] x, int incx) {
        Dscal.dscal(n, alp, x, 0, incx);
    }

    // Forwards every argument unchanged to Dscal.dscal.
    @Override
    public void dscal(int n, double alp, double[] x, int xOffset, int incx) {
        Dscal.dscal(n, alp, x, xOffset, incx);
    }

    // Zero-offset overload: calls Sswap.sswap with xOffset = yOffset = 0.
    @Override
    public void sswap(int n, float[] x, int incx, float[] y, int incy) {
        Sswap.sswap(n, x, 0, incx, y, 0, incy);
    }

    // Forwards every argument unchanged to Sswap.sswap.
    @Override
    public void sswap(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) {
        Sswap.sswap(n, x, xOffset, incx, y, yOffset, incy);
    }

    // Zero-offset overload: calls Dswap.dswap with xOffset = yOffset = 0.
    @Override
    public void dswap(int n, double[] x, int incx, double[] y, int incy) {
        Dswap.dswap(n, x, 0, incx, y, 0, incy);
    }

    // Forwards every argument unchanged to Dswap.dswap.
    @Override
    public void dswap(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) {
        Dswap.dswap(n, x, xOffset, incx, y, yOffset, incy);
    }

    // Zero-offset overload: calls Isamax.isamax with xOffset = 0 and returns its result.
    @Override
    public int isamax(int n, float[] x, int incx) {
        return Isamax.isamax(n, x, 0, incx);
    }

    // Forwards every argument unchanged to Isamax.isamax and returns its result.
    @Override
    public int isamax(int n, float[] x, int xOffset, int incx) {
        return Isamax.isamax(n, x, xOffset, incx);
    }

    // Zero-offset overload: calls Idamax.idamax with xOffset = 0 and returns its result.
    @Override
    public int idamax(int n, double[] x, int incx) {
        return Idamax.idamax(n, x, 0, incx);
    }

    // Forwards every argument unchanged to Idamax.idamax and returns its result.
    @Override
    public int idamax(int n, double[] x, int xOffset, int incx) {
        return Idamax.idamax(n, x, xOffset, incx);
    }

    // Zero-offset overload: calls Dgemv.dgemv with aOffset = xOffset = yOffset = 0.
    @Override
    public void dgemv(String trans, int m, int n, double alpha, double[] a, int lda, double[] x, int incx, double beta,
            double[] y, int incy) {
        Dgemv.dgemv(trans, m, n, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy);
    }

    // Forwards every argument unchanged to Dgemv.dgemv.
    @Override
    public void dgemv(String trans, int m, int n, double alpha, double[] a, int aOffset, int lda, double[] x,
            int xOffset, int incx, double beta, double[] y, int yOffset, int incy) {
        Dgemv.dgemv(trans, m, n, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy);
    }

    // Zero-offset overload: calls Sgemv.sgemv with aOffset = xOffset = yOffset = 0.
    @Override
    public void sgemv(String trans, int m, int n, float alpha, float[] a, int lda, float[] x, int incx, float beta,
            float[] y, int incy) {
        Sgemv.sgemv(trans, m, n, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy);
    }

    // Forwards every argument unchanged to Sgemv.sgemv.
    @Override
    public void sgemv(String trans, int m, int n, float alpha, float[] a, int aOffset, int lda, float[] x,
            int xOffset, int incx, float beta, float[] y, int yOffset, int incy) {
        Sgemv.sgemv(trans, m, n, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy);
    }

    // Zero-offset overload: calls Dger.dger with xOffset = yOffset = aOffset = 0.
    @Override
    public void dger(int m, int n, double alpha, double[] x, int incx, double[] y, int incy, double[] a, int lda) {
        Dger.dger(m, n, alpha, x, 0, incx, y, 0, incy, a, 0, lda);
    }

    // Forwards every argument unchanged to Dger.dger.
    @Override
    public void dger(int m, int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy,
            double[] a, int aOffset, int lda) {
        Dger.dger(m, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset, lda);
    }

    // Zero-offset overload: calls Sger.sger with xOffset = yOffset = aOffset = 0.
    @Override
    public void sger(int m, int n, float alpha, float[] x, int incx, float[] y, int incy, float[] a, int lda) {
        Sger.sger(m, n, alpha, x, 0, incx, y, 0, incy, a, 0, lda);
    }

    // Forwards every argument unchanged to Sger.sger.
    @Override
    public void sger(int m, int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy,
            float[] a, int aOffset, int lda) {
        Sger.sger(m, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset, lda);
    }

    // Zero-offset overload: calls Dspmv.dspmv with aOffset = xOffset = yOffset = 0.
    @Override
    public void dspmv(String uplo, int n, double alpha, double[] a, double[] x, int incx, double beta,
            double[] y, int incy) {
        Dspmv.dspmv(uplo, n, alpha, a, 0, x, 0, incx, beta, y, 0, incy);
    }

    // Forwards every argument unchanged to Dspmv.dspmv.
    @Override
    public void dspmv(String uplo, int n, double alpha, double[] a, int aOffset, double[] x, int xOffset, int incx,
            double beta, double[] y, int yOffset, int incy) {
        Dspmv.dspmv(uplo, n, alpha, a, aOffset, x, xOffset, incx, beta, y, yOffset, incy);
    }

    // Zero-offset overload: calls Sspmv.sspmv with aOffset = xOffset = yOffset = 0.
    @Override
    public void sspmv(String uplo, int n, float alpha, float[] a, float[] x, int incx, float beta,
            float[] y, int incy) {
        Sspmv.sspmv(uplo, n, alpha, a, 0, x, 0, incx, beta, y, 0, incy);
    }

    // Forwards every argument unchanged to Sspmv.sspmv.
    @Override
    public void sspmv(String uplo, int n, float alpha, float[] a, int aOffset, float[] x, int xOffset, int incx,
            float beta, float[] y, int yOffset, int incy) {
        Sspmv.sspmv(uplo, n, alpha, a, aOffset, x, xOffset, incx, beta, y, yOffset, incy);
    }

    // Zero-offset overload: calls Dspr.dspr with xOffset = 0 and a packed-matrix offset of 0.
    @Override
    public void dspr(String uplo, int n, double alpha, double[] x, int incx, double[] ap) {
        Dspr.dspr(uplo, n, alpha, x, 0, incx, ap, 0);
    }

    // Forwards every argument unchanged to Dspr.dspr.
    @Override
    public void dspr(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] ap, int aOffset) {
        Dspr.dspr(uplo, n, alpha, x, xOffset, incx, ap, aOffset);
    }

    // Zero-offset overload: calls Sspr.sspr with xOffset = 0 and a packed-matrix offset of 0.
    @Override
    public void sspr(String uplo, int n, float alpha, float[] x, int incx, float[] ap) {
        Sspr.sspr(uplo, n, alpha, x, 0, incx, ap, 0);
    }

    // Forwards every argument unchanged to Sspr.sspr.
    @Override
    public void sspr(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] ap, int aOffset) {
        Sspr.sspr(uplo, n, alpha, x, xOffset, incx, ap, aOffset);
    }

    // Zero-offset overload: calls Dsymv.dsymv with aOffset = xOffset = yOffset = 0.
    @Override
    public void dsymv(String uplo, int n, double alpha, double[] a, int lda, double[] x, int incx, double beta,
            double[] y, int incy) {
        Dsymv.dsymv(uplo, n, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy);
    }

    // Forwards every argument unchanged to Dsymv.dsymv.
    @Override
    public void dsymv(String uplo, int n, double alpha, double[] a, int aOffset, int lda, double[] x, int xOffset,
            int incx, double beta, double[] y, int yOffset, int incy) {
        Dsymv.dsymv(uplo, n, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy);
    }

    // Zero-offset overload: calls Ssymv.ssymv with aOffset = xOffset = yOffset = 0.
    @Override
    public void ssymv(String uplo, int n, float alpha, float[] a, int lda, float[] x, int incx, float beta,
            float[] y, int incy) {
        Ssymv.ssymv(uplo, n, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy);
    }

    // Forwards every argument unchanged to Ssymv.ssymv.
    @Override
    public void ssymv(String uplo, int n, float alpha, float[] a, int aOffset, int lda, float[] x, int xOffset,
            int incx, float beta, float[] y, int yOffset, int incy) {
        Ssymv.ssymv(uplo, n, alpha, a, aOffset, lda, x, xOffset, incx, beta, y, yOffset, incy);
    }

    // Zero-offset overload: calls Dgemm.dgemm with aOffset = bOffset = cOffset = 0.
    @Override
    public void dgemm(String transa, String transb, int m, int n, int k, double alpha, double[] a, int lda,
            double[] b, int ldb, double beta, double[] c, int ldc) {
        Dgemm.dgemm(transa, transb, m, n, k, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc);
    }

    // Forwards every argument unchanged to Dgemm.dgemm.
    @Override
    public void dgemm(String transa, String transb, int m, int n, int k, double alpha, double[] a, int aOffset,
            int lda, double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc) {
        Dgemm.dgemm(transa, transb, m, n, k, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc);
    }

    // Zero-offset overload: calls Sgemm.sgemm with aOffset = bOffset = cOffset = 0.
    @Override
    public void sgemm(String transa, String transb, int m, int n, int k, float alpha, float[] a, int lda,
            float[] b, int ldb, float beta, float[] c, int ldc) {
        Sgemm.sgemm(transa, transb, m, n, k, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc);
    }

    // Forwards every argument unchanged to Sgemm.sgemm.
    @Override
    public void sgemm(String transa, String transb, int m, int n, int k, float alpha, float[] a, int aOffset,
            int lda, float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc) {
        Sgemm.sgemm(transa, transb, m, n, k, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc);
    }

    // Zero-offset overload: calls Dsymm.dsymm with aOffset = bOffset = cOffset = 0.
    @Override
    public void dsymm(String side, String uplo, int m, int n, double alpha, double[] a, int lda,
            double[] b, int ldb, double beta, double[] c, int ldc) {
        Dsymm.dsymm(side, uplo, m, n, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc);
    }

    // Forwards every argument unchanged to Dsymm.dsymm.
    @Override
    public void dsymm(String side, String uplo, int m, int n, double alpha, double[] a, int aOffset, int lda,
            double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc) {
        Dsymm.dsymm(side, uplo, m, n, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc);
    }

    // Zero-offset overload: calls Ssymm.ssymm with aOffset = bOffset = cOffset = 0.
    @Override
    public void ssymm(String side, String uplo, int m, int n, float alpha, float[] a, int lda,
            float[] b, int ldb, float beta, float[] c, int ldc) {
        Ssymm.ssymm(side, uplo, m, n, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc);
    }

    // Forwards every argument unchanged to Ssymm.ssymm.
    @Override
    public void ssymm(String side, String uplo, int m, int n, float alpha, float[] a, int aOffset, int lda,
            float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc) {
        Ssymm.ssymm(side, uplo, m, n, alpha, a, aOffset, lda, b, bOffset, ldb, beta, c, cOffset, ldc);
    }
}
+ * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas1.doubleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; + +public class Dasum { + private static final VectorSpecies DSPECIES = DoubleVector.SPECIES_MAX; + public static double dasum(int n, double[] x, int xOffset, int incx) { + if (n < 1 || incx < 1) { + return 0.0; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + if (incx == 1) { + return vecDasum(n, x, xOffset); + } + return norDasum(n, x, xOffset, incx); + } + + private static double vecDasum(int n, double[] x, int xOffset) { + int xIndex = 0; + DoubleVector resVec = DoubleVector.zero(DSPECIES); + int xLoopBound = DSPECIES.loopBound(n); + for (; xIndex < xLoopBound; xIndex += DSPECIES.length()) { + DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, xIndex + xOffset); + resVec = resVec.add(xv.abs()); + } + double result = resVec.reduceLanes(VectorOperators.ADD); + for (; xIndex < n; xIndex++) { + result += Math.abs(x[xIndex + xOffset]); + } + return result; + } + + private static double 
norDasum(int n, double[] x, int xOffset, int incx) { + double result = 0.0d; + int xIndex = 0; + for (int count = 0; count < n; count++) { + result += Math.abs(x[xIndex + xOffset]); + xIndex += incx; + } + return result; + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Daxpy.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Daxpy.java new file mode 100644 index 0000000000000000000000000000000000000000..fea183b58f45250efcaa51b00c64836b1db0c0b1 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Daxpy.java @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.huawei.vectorblas.blas1.doubleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorSpecies; + +public class Daxpy { + private static final VectorSpecies DSPECIES = DoubleVector.SPECIES_MAX; + + public static void daxpy(int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset, + int incy) { + if (n < 1 || BlasUtils.isZero(alpha)) { + return; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length); + if (incx == 1 && incy == 1) { + vecDaxpy(n, alpha, x, xOffset, y, yOffset); + } else { + norDaxpy(n, alpha, x, xOffset, incx, y, yOffset, incy); + } + } + + private static void vecDaxpy(int n, double alpha, double[] x, int xOffset, double[] y, int yOffset) { + DoubleVector alphaVec = DoubleVector.broadcast(DSPECIES, alpha); + int index = 0; + int loopBound = DSPECIES.loopBound(n); + for (; index < loopBound; index += DSPECIES.length()) { + DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, index + xOffset); + DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, index + yOffset); + alphaVec.fma(xv, yv).intoArray(y, index + yOffset); + } + for (; index < n; index++) { + y[index + yOffset] += alpha * x[index + xOffset]; + } + } + + private static void norDaxpy(int n, double alpha, double[] x, int xOffset, int incx, + double[] y, int yOffset, int incy) { + int xIndex = incx >= 0 ? 0 : (n - 1) * -incx; + int yIndex = incy >= 0 ? 
0 : (n - 1) * -incy; + for (int count = 0; count < n; count++) { + y[yIndex + yOffset] += alpha * x[xIndex + xOffset]; + xIndex += incx; + yIndex += incy; + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dcopy.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dcopy.java new file mode 100644 index 0000000000000000000000000000000000000000..3ffc363604dab93fff480a5176f07a18b8f128cb --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dcopy.java @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.huawei.vectorblas.blas1.doubleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +public class Dcopy { + public static void dcopy(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) { + if (n <= 0) { + return; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length); + if ((incx == 1 && incy == 1) || (incx == -1 && incy == -1)) { + System.arraycopy(x, xOffset, y, yOffset, n); + } else { + norDcopy(n, x, xOffset, incx, y, yOffset, incy); + } + } + + private static void norDcopy(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) { + int xInitIndex = incx < 0 ? (-n + 1) * incx : 0; + int yInitIndex = incy < 0 ? (-n + 1) * incy : 0; + for (int i = n; i > 0; --i) { + y[yInitIndex + yOffset] = x[xInitIndex + xOffset]; + xInitIndex += incx; + yInitIndex += incy; + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Ddot.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Ddot.java new file mode 100644 index 0000000000000000000000000000000000000000..b9742dfdf7c15851b95b713cc90e26d3384f1288 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Ddot.java @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas1.doubleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; + +public class Ddot { + private static final VectorSpecies DSPECIES = DoubleVector.SPECIES_MAX; + + public static double ddot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) { + if (n < 1) { + return 0.0d; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length); + if (incx == 1 && incy == 1) { + if (xOffset == 0 && yOffset == 0) { + return vecDdot(n, x, y); + } + return vecDdot(n, x, xOffset, y, yOffset); + } + return norDdot(n, x, xOffset, incx, y, yOffset, incy); + } + + private static double vecDdot(int n, double[] x, double[] y) { + int index = 0; + DoubleVector sumVec = DoubleVector.zero(DSPECIES); + int idxLoopBound = DSPECIES.loopBound(n); + for (; index < idxLoopBound; index += DSPECIES.length()) { + DoubleVector av = DoubleVector.fromArray(DSPECIES, x, index); + DoubleVector bv = DoubleVector.fromArray(DSPECIES, y, index); + sumVec = av.fma(bv, sumVec); + } + double sum = sumVec.reduceLanes(VectorOperators.ADD); + for (; index < n; index++) { + sum += x[index] * y[index]; + } + return sum; + } + + private static double vecDdot(int n, double[] x, int xOffset, double[] y, int yOffset) { + int index = 0; + DoubleVector sumVec = DoubleVector.zero(DSPECIES); + int idxLoopBound 
= DSPECIES.loopBound(n); + for (; index < idxLoopBound; index += DSPECIES.length()) { + DoubleVector av = DoubleVector.fromArray(DSPECIES, x, index + xOffset); + DoubleVector bv = DoubleVector.fromArray(DSPECIES, y, index + yOffset); + sumVec = av.fma(bv, sumVec); + } + double sum = sumVec.reduceLanes(VectorOperators.ADD); + for (; index < n; index++) { + sum += x[index + xOffset] * y[index + yOffset]; + } + return sum; + } + + private static double norDdot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) { + int xIndex = incx >= 0 ? 0 : (n - 1) * -incx; + int yIndex = incy >= 0 ? 0 : (n - 1) * -incy; + double sum = 0.0d; + for (int count = 0; count < n; count++) { + sum += y[yIndex + yOffset] * x[xIndex + xOffset]; + xIndex += incx; + yIndex += incy; + } + return sum; + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dnrm2.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dnrm2.java new file mode 100644 index 0000000000000000000000000000000000000000..b6dd883161376cd63b1e3556f83fd9750122a223 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dnrm2.java @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas1.doubleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +public class Dnrm2 { + private static final int MINEXPONENT = -1021; // -1021 is the minimum exponent in the model of the type of double. + private static final int MAXEXPONENT = 1024; // 1024 is the maximum exponent in the model of the type of double. + private static final int DIGITS = 53; // 53 is the number of significant binary digits of double. + public static double dnrm2(int n, double[] x, int xOffset, int incx) { + if (n < 1 || incx < 1) { + return 0.0; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + return norDnrm2(n, x, xOffset, incx); + } + + private static double norDnrm2(int n, double[] x, int xOffset, int incx) { + /* + * tSml, tBig, sSml, sBig are Blue's scaling constants. 
+ */ + double tSml = Math.pow(2, Math.ceil((MINEXPONENT - 1) * 0.5d)); + double tBig = Math.pow(2, Math.floor((MAXEXPONENT - DIGITS + 1) * 0.5d)); + double sSml = Math.pow(2, -1 * Math.floor((MINEXPONENT - DIGITS) * 0.5d)); + double sBig = Math.pow(2, -1 * Math.ceil((MAXEXPONENT + DIGITS - 1) * 0.5d)); + boolean notBig = true; + double aSml = 0.0d; + double aMed = 0.0d; + double aBig = 0.0d; + + int xIndex = 0; + for (int count = 0; count < n; count++) { + double ax = Math.abs(x[xOffset + xIndex]); + if (ax > tBig) { + aBig += (ax * sBig) * (ax * sBig); + notBig = false; + } else if (ax < tSml) { + if (notBig) { + aSml += (ax * sSml) * (ax * sSml); + } + } else { + aMed += ax * ax; + } + xIndex += incx; + } + + double maxN = Double.MAX_VALUE; + double scaleVal; + double sumSq; + if (aBig > 0.0) { + if ((aMed > 0.0) || (aMed > maxN) || (Double.compare(aMed, aMed) != 0)) { + aBig += (aMed * sBig) * sBig; + } + scaleVal = 1.0d / sBig; + sumSq = aBig; + } else if (aSml > 0.0) { + if ((aMed > 0.0) || (aMed > maxN) || (Double.compare(aMed, aMed) != 0)) { + aMed = Math.sqrt(aMed); + aSml = Math.sqrt(aSml) / sSml; + double yMin = aSml > aMed ? aMed : aSml; + double yMax = aSml > aMed ? aSml : aMed; + scaleVal = 1.0d; + double yMinDevideMax = yMin / yMax; + sumSq = yMax * yMax * (1.0d + yMinDevideMax * yMinDevideMax); + } else { + scaleVal = 1.0d / sSml; + sumSq = aSml; + } + } else { + scaleVal = 1.0d; + sumSq = aMed; + } + return scaleVal * Math.sqrt(sumSq); + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Drot.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Drot.java new file mode 100644 index 0000000000000000000000000000000000000000..bd6ba3a34ead3d60a02593c52d4b95c7b42579e3 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Drot.java @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. 
+ * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas1.doubleprecision; + +import static com.huawei.vectorblas.utils.ArrayUtil.loopBound; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorSpecies; + +public class Drot { + private static final VectorSpecies DSPECIES = DoubleVector.SPECIES_MAX; + + public static void drot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy, double c, + double s) { + if (n < 1) { + return; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length); + if (incx == 1 && incy == 1) { + vecDrot(n, x, xOffset, y, yOffset, c, s); + } else { + norDrot(n, x, xOffset, incx, y, yOffset, incy, c, s); + } + } + + private static void vecDrot(int n, double[] x, int xOffset, double[] y, int yOffset, double c, double s) { + DoubleVector cv = DoubleVector.broadcast(DSPECIES, c); + 
DoubleVector sv = DoubleVector.broadcast(DSPECIES, s); + DoubleVector nsv = DoubleVector.broadcast(DSPECIES, -s); + int index = 0; + int idxLoopBound = loopBound(n, DSPECIES.length() * 4); + for (; index < idxLoopBound; index += DSPECIES.length() * 4) { + DoubleVector xv0 = DoubleVector.fromArray(DSPECIES, x, index + xOffset); + DoubleVector xv1 = DoubleVector.fromArray(DSPECIES, x, index + DSPECIES.length() + xOffset); + DoubleVector xv2 = DoubleVector.fromArray(DSPECIES, x, index + DSPECIES.length() * 2 + xOffset); + DoubleVector xv3 = DoubleVector.fromArray(DSPECIES, x, index + DSPECIES.length() * 3 + xOffset); + + DoubleVector yv0 = DoubleVector.fromArray(DSPECIES, y, index + yOffset); + DoubleVector yv1 = DoubleVector.fromArray(DSPECIES, y, index + DSPECIES.length() + yOffset); + DoubleVector yv2 = DoubleVector.fromArray(DSPECIES, y, index + DSPECIES.length() * 2 + yOffset); + DoubleVector yv3 = DoubleVector.fromArray(DSPECIES, y, index + DSPECIES.length() * 3 + yOffset); + + xv0.fma(cv, yv0.mul(sv)).intoArray(x, index + xOffset); + xv1.fma(cv, yv1.mul(sv)).intoArray(x, index + DSPECIES.length() + xOffset); + xv2.fma(cv, yv2.mul(sv)).intoArray(x, index + DSPECIES.length() * 2 + xOffset); + xv3.fma(cv, yv3.mul(sv)).intoArray(x, index + DSPECIES.length() * 3 + xOffset); + + xv0.fma(nsv, yv0.mul(cv)).intoArray(y, index + yOffset); + xv1.fma(nsv, yv1.mul(cv)).intoArray(y, index + DSPECIES.length() + yOffset); + xv2.fma(nsv, yv2.mul(cv)).intoArray(y, index + DSPECIES.length() * 2 + yOffset); + xv3.fma(nsv, yv3.mul(cv)).intoArray(y, index + DSPECIES.length() * 3 + yOffset); + } + for (; index < DSPECIES.loopBound(n); index += DSPECIES.length()) { + DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, index + xOffset); + DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, index + yOffset); + + xv.fma(cv, yv.mul(sv)).intoArray(x, index + xOffset); + xv.fma(nsv, yv.mul(cv)).intoArray(y, index + yOffset); + } + for (; index < n; index++) { + double tmp = x[index + 
xOffset]; + x[index + xOffset] = c * x[index + xOffset] + s * y[index + yOffset]; + y[index + yOffset] = c * y[index + yOffset] - s * tmp; + } + } + + private static void norDrot(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy, + double c, double s) { + int xInitIndex = incx < 0 ? (-n + 1) * incx : 0; + int yInitIndex = incy < 0 ? (-n + 1) * incy : 0; + for (int num = n; num > 0; --num) { + double tmp = x[xInitIndex + xOffset]; + x[xInitIndex + xOffset] = c * tmp + s * y[yInitIndex + yOffset]; + y[yInitIndex + yOffset] = -s * tmp + c * y[yInitIndex + yOffset]; + xInitIndex += incx; + yInitIndex += incy; + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Drotm.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Drotm.java new file mode 100644 index 0000000000000000000000000000000000000000..592e76e765e4911a6c8b131678142fe6772cfbe2 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Drotm.java @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas1.doubleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorSpecies; + +public class Drotm { + private static final VectorSpecies DSPECIES = DoubleVector.SPECIES_MAX; + + public static void drotm(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy, + double[] param, int paramOffset) { + if (n < 1) { + return; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length); + BlasUtils.checkBlasArray("param", paramOffset, 4, param.length); + if (incx == 1 && incy == 1) { + vecDrotm(n, x, xOffset, y, yOffset, param, paramOffset); + } else { + norDrotm(n, x, xOffset, incx, y, yOffset, incy, param, paramOffset); + } + } + + private static void vecDrotm(int n, double[] x, int xOffset, double[] y, int yOffset, double[] param, + int paramOffset) { + double flag = param[paramOffset]; + if (Double.compare(flag, -2.0d) == 0) { // If flag equals -2.0, do nothing and return directly. 
+ return; + } + double h11 = param[paramOffset + 1]; + double h12 = 1.0d; + double h21 = -1.0d; + double h22 = param[paramOffset + 4]; + if (Double.compare(flag, -1.0d) == 0) { + h12 = param[paramOffset + 3]; + h21 = param[paramOffset + 2]; + } else if (BlasUtils.isZero(flag)) { + h11 = 1.0d; + h12 = param[paramOffset + 3]; + h21 = param[paramOffset + 2]; + h22 = 1.0d; + } + DoubleVector h11v = DoubleVector.broadcast(DSPECIES, h11); + DoubleVector h12v = DoubleVector.broadcast(DSPECIES, h12); + DoubleVector h21v = DoubleVector.broadcast(DSPECIES, h21); + DoubleVector h22v = DoubleVector.broadcast(DSPECIES, h22); + int index = 0; + int idxLoopBound = DSPECIES.loopBound(n); + for (; index < idxLoopBound; index += DSPECIES.length()) { + DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, index + xOffset); + DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, index + yOffset); + (xv.mul(h11v)).add(yv.mul(h12v)).intoArray(x, index + xOffset); + (xv.mul(h21v)).add(yv.mul(h22v)).intoArray(y, index + yOffset); + } + for (; index < n; index++) { + double xTmp = x[index + xOffset]; + x[index + xOffset] = h11 * xTmp + h12 * y[index + yOffset]; + y[index + yOffset] = h21 * xTmp + h22 * y[index + yOffset]; + } + } + + private static void norDrotm(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy, + double[] param, int paramOffset) { + double flag = param[paramOffset]; + if (Double.compare(flag, -2.0d) == 0) { // If flag equals -2.0, do nothing and return directly. + return; + } + double h11 = param[paramOffset + 1]; + double h12 = 1.0d; + double h21 = -1.0d; + double h22 = param[paramOffset + 4]; + if (Double.compare(flag, -1.0d) == 0) { + h12 = param[paramOffset + 3]; + h21 = param[paramOffset + 2]; + } else if (BlasUtils.isZero(flag)) { + h11 = 1.0d; + h12 = param[paramOffset + 3]; + h21 = param[paramOffset + 2]; + h22 = 1.0d; + } + int xInitIndex = incx < 0 ? (-n + 1) * incx : 0; + int yInitIndex = incy < 0 ? 
(-n + 1) * incy : 0; + for (int num = n; num > 0; --num) { + double xTmp = x[xInitIndex + xOffset]; + x[xInitIndex + xOffset] = h11 * xTmp + h12 * y[yInitIndex + yOffset]; + y[yInitIndex + yOffset] = h21 * xTmp + h22 * y[yInitIndex + yOffset]; + xInitIndex += incx; + yInitIndex += incy; + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dscal.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dscal.java new file mode 100644 index 0000000000000000000000000000000000000000..a6c78751990e9afb2fb741f8602cf3e1d9b1ffcd --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dscal.java @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.huawei.vectorblas.blas1.doubleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorSpecies; + +public class Dscal { + private static final VectorSpecies DSPECIES = DoubleVector.SPECIES_MAX; + + public static void dscal(int n, double alpha, double[] x, int xOffset, int incx) { + if (n < 1 || incx < 1 || Double.compare(alpha, 1.0) == 0) { + return; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + if (incx == 1) { + vecDscal(n, alpha, x, xOffset); + } else { + norDscal(n, alpha, x, xOffset, incx); + } + } + + private static void vecDscal(int n, double alpha, double[] x, int xOffset) { + DoubleVector alpv = DoubleVector.broadcast(DSPECIES, alpha); + int index = 0; + int idxLoopBound = DSPECIES.loopBound(n); + for (; index < idxLoopBound; index += DSPECIES.length()) { + DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, index + xOffset); + xv.mul(alpv).intoArray(x, index + xOffset); + } + for (; index < n; index++) { + x[index + xOffset] *= alpha; + } + } + + private static void norDscal(int n, double alpha, double[] x, int xOffset, int incx) { + int xInitIndex = 0; + for (int num = 0; num < n; num++) { + x[xInitIndex + xOffset] = alpha * x[xInitIndex + xOffset]; + xInitIndex += incx; + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dswap.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dswap.java new file mode 100644 index 0000000000000000000000000000000000000000..f9bcd33108117a20c368e072504e09578f62a383 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Dswap.java @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. 
+ * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas1.doubleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorSpecies; + +public class Dswap { + private static final VectorSpecies DSPECIES = DoubleVector.SPECIES_MAX; + + public static void dswap(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) { + if (n < 1) { + return; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length); + if (incx == 1 && incy == 1) { + vecDswap(n, x, xOffset, y, yOffset); + } else { + norDswap(n, x, xOffset, incx, y, yOffset, incy); + } + } + + private static void vecDswap(int n, double[] x, int xOffset, double[] y, int yOffset) { + int index = 0; + int idxLoopBound = DSPECIES.loopBound(n); + for (; index < idxLoopBound; index += DSPECIES.length()) { + DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, 
index + xOffset); + DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, index + yOffset); + xv.intoArray(y, index + yOffset); + yv.intoArray(x, index + xOffset); + } + for (; index < n; index++) { + double tmp = x[index + xOffset]; + x[index + xOffset] = y[index + yOffset]; + y[index + yOffset] = tmp; + } + } + + private static void norDswap(int n, double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) { + int xIndex = incx < 0 ? (-n + 1) * incx + 1 : 1; + int yIndex = incy < 0 ? (-n + 1) * incy + 1 : 1; + for (int num = n; num > 0; --num) { + double tmp = x[xIndex - 1 + xOffset]; + x[xIndex - 1 + xOffset] = y[yIndex - 1 + yOffset]; + y[yIndex - 1 + yOffset] = tmp; + xIndex += incx; + yIndex += incy; + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Idamax.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Idamax.java new file mode 100644 index 0000000000000000000000000000000000000000..6da1ee69e8cbd390d2b7f2b603944c75b4d87a3f --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/doubleprecision/Idamax.java @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas1.doubleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; + +public class Idamax { + private static final VectorSpecies DSPECIES = DoubleVector.SPECIES_MAX; + + public static int idamax(int n, double[] x, int xOffset, int incx) { + if (n <= 0 || incx <= 0) { + return 0; + } + if (n == 1) { + return 1; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + if (incx == 1) { + return vecIdamax(n, x, xOffset); + } else { + return norIdamax(n, x, xOffset, incx); + } + } + + private static int vecIdamax(int n, double[] x, int xOffset) { + int indexOfMaxVec = 0; + double max = 0.0d; + int index = 0; + int idxLoopBound = DSPECIES.loopBound(n); + for (; index < idxLoopBound; index += DSPECIES.length()) { + DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, index + xOffset); + double maxOfLanes = xv.abs().reduceLanes(VectorOperators.MAX); + if (max < maxOfLanes) { + max = maxOfLanes; + indexOfMaxVec = index; + } + } + int indexOfMaxValue = 0; + for (int j = indexOfMaxVec; j < indexOfMaxVec + DSPECIES.length(); j++) { + if (max <= Math.abs(x[j + xOffset])) { + indexOfMaxValue = j + 1; + break; + } + } + for (; index < n; index++) { + if (max < Math.abs(x[index + xOffset])) { + max = Math.abs(x[index + xOffset]); + indexOfMaxValue = index + 1; + } + } + return indexOfMaxValue; + } + + private static int norIdamax(int n, double[] x, int xOffset, int 
incx) { + int indexOfMaxValue = 1; + double max = Math.abs(x[xOffset]); + int xIndex = incx; + for (int j = 2; j <= n; ++j) { + double value = Math.abs(x[xIndex + xOffset]); + if (value > max) { + indexOfMaxValue = j; + max = value; + } + xIndex += incx; + } + return indexOfMaxValue; + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Isamax.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Isamax.java new file mode 100644 index 0000000000000000000000000000000000000000..88b7e735dae805fd89f9db1f9022908f09c8fca1 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Isamax.java @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.huawei.vectorblas.blas1.singleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; + +public class Isamax { + private static final VectorSpecies SSPECIES = FloatVector.SPECIES_MAX; + + public static int isamax(int n, float[] x, int xOffset, int incx) { + if (n <= 0 || incx <= 0) { + return 0; + } + if (n == 1) { + return 1; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + if (incx == 1) { + return vecIsamax(n, x, xOffset); + } else { + return norIsamax(n, x, xOffset, incx); + } + } + + private static int vecIsamax(int n, float[] x, int xOffset) { + float max = 0.0f; + int indexOfMaxVec = 0; + int index = 0; + int idxLoopBound = SSPECIES.loopBound(n); + for (; index < idxLoopBound; index += SSPECIES.length()) { + FloatVector xv = FloatVector.fromArray(SSPECIES, x, index + xOffset); + float maxOfLanes = xv.abs().reduceLanes(VectorOperators.MAX); + if (max < maxOfLanes) { + max = maxOfLanes; + indexOfMaxVec = index; + } + } + int indexOfMaxValue = 0; + for (int j = indexOfMaxVec; j < indexOfMaxVec + SSPECIES.length(); j++) { + if (max <= Math.abs(x[j + xOffset])) { + indexOfMaxValue = j + 1; + break; + } + } + for (; index < n; index++) { + if (max < Math.abs(x[index + xOffset])) { + max = Math.abs(x[index + xOffset]); + indexOfMaxValue = index + 1; + } + } + return indexOfMaxValue; + } + + private static int norIsamax(int n, float[] x, int xOffset, int incx) { + int indexOfMaxValue = 1; + float max = Math.abs(x[xOffset]); + int xIndex = incx; + for (int j = 2; j <= n; j++) { + float val = Math.abs(x[xIndex + xOffset]); + if (val > max) { + indexOfMaxValue = j; + max = val; + } + xIndex += incx; + } + return indexOfMaxValue; + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sasum.java 
b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sasum.java new file mode 100644 index 0000000000000000000000000000000000000000..4574795989599d8379e9c6b0a774db0760c0e793 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sasum.java @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.huawei.vectorblas.blas1.singleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; + +public class Sasum { + private static final VectorSpecies SSPECIES = FloatVector.SPECIES_MAX; + + public static float sasum(int n, float[] x, int xOffset, int incx) { + if (n < 1 || incx < 1) { + return 0.0f; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + if (incx == 1) { + return vecSasum(n, x, xOffset); + } + return norSasum(n, x, xOffset, incx); + } + + private static float vecSasum(int n, float[] x, int xOffset) { + int xIndex = 0; + FloatVector resVec = FloatVector.zero(SSPECIES); + int idxLoopBound = SSPECIES.loopBound(n); + for (; xIndex < idxLoopBound; xIndex += SSPECIES.length()) { + FloatVector xv = FloatVector.fromArray(SSPECIES, x, xIndex + xOffset); + resVec = resVec.add(xv.abs()); + } + float result = resVec.reduceLanes(VectorOperators.ADD); + for (; xIndex < n; xIndex++) { + result += Math.abs(x[xIndex + xOffset]); + } + return result; + } + + private static float norSasum(int n, float[] x, int xOffset, int incx) { + float result = 0.0f; + int xIndex = 0; + for (int count = 0; count < n; count++) { + result += Math.abs(x[xIndex + xOffset]); + xIndex += incx; + } + return result; + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Saxpy.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Saxpy.java new file mode 100644 index 0000000000000000000000000000000000000000..5dd3675a29f1dab6361c9f50be4289c65b99113a --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Saxpy.java @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. 
+ * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas1.singleprecision; + +import static com.huawei.vectorblas.utils.ArrayUtil.loopBound; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorSpecies; + +public class Saxpy { + private static final VectorSpecies SSPECIES = FloatVector.SPECIES_MAX; + + public static void saxpy(int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) { + if (n < 1 || BlasUtils.isZero(alpha)) { + return; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length); + if (incx == 1 && incy == 1) { + vecSaxpy(n, alpha, x, xOffset, y, yOffset); + } else { + norSaxpy(n, alpha, x, xOffset, incx, y, yOffset, incy); + } + } + + private static void vecSaxpy(int n, float alpha, float[] x, int xOffset, float[] y, int yOffset) { + FloatVector alphaVec = 
FloatVector.broadcast(SSPECIES, alpha); + int index = 0; + int idxLoopBound = loopBound(n, (SSPECIES.length() * 4)); + for (; index < idxLoopBound; index += SSPECIES.length() * 4) { + FloatVector xv0 = FloatVector.fromArray(SSPECIES, x, index + xOffset); + FloatVector xv1 = FloatVector.fromArray(SSPECIES, x, index + SSPECIES.length() + xOffset); + FloatVector xv2 = FloatVector.fromArray(SSPECIES, x, index + SSPECIES.length() * 2 + xOffset); + FloatVector xv3 = FloatVector.fromArray(SSPECIES, x, index + SSPECIES.length() * 3 + xOffset); + + FloatVector yv0 = FloatVector.fromArray(SSPECIES, y, index + yOffset); + FloatVector yv1 = FloatVector.fromArray(SSPECIES, y, index + SSPECIES.length() + yOffset); + FloatVector yv2 = FloatVector.fromArray(SSPECIES, y, index + SSPECIES.length() * 2 + yOffset); + FloatVector yv3 = FloatVector.fromArray(SSPECIES, y, index + SSPECIES.length() * 3 + yOffset); + + xv0.fma(alphaVec, yv0).intoArray(y, index + yOffset); + xv1.fma(alphaVec, yv1).intoArray(y, index + SSPECIES.length() + yOffset); + xv2.fma(alphaVec, yv2).intoArray(y, index + SSPECIES.length() * 2 + yOffset); + xv3.fma(alphaVec, yv3).intoArray(y, index + SSPECIES.length() * 3 + yOffset); + } + for (; index < SSPECIES.loopBound(n); index += SSPECIES.length()) { + FloatVector xv0 = FloatVector.fromArray(SSPECIES, x, index + xOffset); + FloatVector yv0 = FloatVector.fromArray(SSPECIES, y, index + yOffset); + xv0.fma(alphaVec, yv0).intoArray(y, index + yOffset); + } + for (; index < n; index++) { + y[index + yOffset] += alpha * x[index + xOffset]; + } + } + + private static void norSaxpy(int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset, + int incy) { + int xIndex = incx >= 0 ? 0 : (n - 1) * -incx; + int yIndex = incy >= 0 ? 
0 : (n - 1) * -incy; + for (int count = 0; count < n; count++) { + y[yIndex + yOffset] += alpha * x[xIndex + xOffset]; + xIndex += incx; + yIndex += incy; + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Scopy.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Scopy.java new file mode 100644 index 0000000000000000000000000000000000000000..7393bf3f7542237138e3ba7b4744d1c100df61e4 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Scopy.java @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.huawei.vectorblas.blas1.singleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +public class Scopy { + public static void scopy(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) { + if (n <= 0) { + return; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length); + if (incx == 1 && incy == 1) { + System.arraycopy(x, xOffset, y, yOffset, n); + } else { + norScopy(n, x, xOffset, incx, y, yOffset, incy); + } + } + + private static void norScopy(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) { + int xIndex = incx < 0 ? (-n + 1) * incx : 0; + int yIndex = incy < 0 ? (-n + 1) * incy : 0; + for (int i = n; i > 0; --i) { + y[yIndex + yOffset] = x[xIndex + xOffset]; + xIndex += incx; + yIndex += incy; + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sdot.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sdot.java new file mode 100644 index 0000000000000000000000000000000000000000..6a8f8343fd5aa659e8c5dc34b390483f15eaa6c4 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sdot.java @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas1.singleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; + +public class Sdot { + private static final VectorSpecies SSPECIES = FloatVector.SPECIES_MAX; + + public static float sdot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) { + if (n < 1) { + return 0.0f; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length); + if (incx == 1 && incy == 1) { + if (xOffset == 0 && yOffset == 0) { + return vecSdot(n, x, y); + } + return vecSdot(n, x, xOffset, y, yOffset); + } + return norSdot(n, x, xOffset, incx, y, yOffset, incy); + } + + private static float vecSdot(int n, float[] x, float[] y) { + FloatVector sumVec = FloatVector.zero(SSPECIES); + int index = 0; + int idxLoopBound = SSPECIES.loopBound(n); + for (; index < idxLoopBound; index += SSPECIES.length()) { + FloatVector av = FloatVector.fromArray(SSPECIES, x, index); + FloatVector bv = FloatVector.fromArray(SSPECIES, y, index); + sumVec = av.fma(bv, sumVec); + } + float sum = sumVec.reduceLanes(VectorOperators.ADD); + for (; index < n; index++) { + sum += x[index] * y[index]; + } + return sum; + } + + private static float vecSdot(int n, float[] x, int xOffset, float[] y, int yOffset) { + FloatVector sumVec = FloatVector.zero(SSPECIES); + int index = 0; + int idxLoopBound = 
SSPECIES.loopBound(n); + for (; index < idxLoopBound; index += SSPECIES.length()) { + FloatVector av = FloatVector.fromArray(SSPECIES, x, index + xOffset); + FloatVector bv = FloatVector.fromArray(SSPECIES, y, index + yOffset); + sumVec = av.fma(bv, sumVec); + } + float sum = sumVec.reduceLanes(VectorOperators.ADD); + for (; index < n; index++) { + sum += x[index + xOffset] * y[index + yOffset]; + } + return sum; + } + + private static float norSdot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) { + int xIndex = incx >= 0 ? 0 : (n - 1) * -incx; + int yIndex = incy >= 0 ? 0 : (n - 1) * -incy; + float sum = 0.0f; + for (int count = 0; count < n; count++) { + sum += y[yIndex + yOffset] * x[xIndex + xOffset]; + xIndex += incx; + yIndex += incy; + } + return sum; + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Snrm2.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Snrm2.java new file mode 100644 index 0000000000000000000000000000000000000000..9fad7f6b4ba43e724356652e317fe4314ffdc56c --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Snrm2.java @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas1.singleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +public class Snrm2 { + private static final int MINEXPONENT = -125; // -125 is the minimum exponent in the model of the type of float. + private static final int MAXEXPONENT = 128; // 128 is the maximum exponent in the model of the type of float. + private static final int DIGITS = 24; // 24 is the number of significant binary digits of float. + public static float snrm2(int n, float[] x, int xOffset, int incx) { + if (n < 1 || incx < 1) { + return 0.0f; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + return norSnrm2(n, x, xOffset, incx); + } + + private static float norSnrm2(int n, float[] x, int xOffset, int incx) { + /* + * tSml, tBig, sSml, sBig are Blue's scaling constants. 
+ */ + float tSml = (float) Math.pow(2, Math.ceil((MINEXPONENT - 1) * 0.5f)); + float tBig = (float) Math.pow(2, Math.floor((MAXEXPONENT - DIGITS + 1) * 0.5f)); + float sSml = (float) Math.pow(2, -1 * Math.floor((MINEXPONENT - DIGITS) * 0.5f)); + float sBig = (float) Math.pow(2, -1 * Math.ceil((MAXEXPONENT + DIGITS - 1) * 0.5f)); + boolean notBig = true; + float aSml = 0.0f; + float aMed = 0.0f; + float aBig = 0.0f; + + int xIndex = 0; + for (int count = 0; count < n; count++) { + float ax = Math.abs(x[xOffset + xIndex]); + if (ax > tBig) { + aBig += (ax * sBig) * (ax * sBig); + notBig = false; + } else if (ax < tSml) { + if (notBig) { + aSml += (ax * sSml) * (ax * sSml); + } + } else { + aMed += ax * ax; + } + xIndex += incx; + } + + float maxN = Float.MAX_VALUE; + float scaleVal; + float sumSq; + if (aBig > 0.0) { + if ((aMed > 0.0) || (aMed > maxN) || (Float.compare(aMed, aMed) != 0)) { + aBig += (aMed * sBig) * sBig; + } + scaleVal = 1.0f / sBig; + sumSq = aBig; + } else if (aSml > 0.0) { + if ((aMed > 0.0) || (aMed > maxN) || (Float.compare(aMed, aMed) != 0)) { + aMed = (float) Math.sqrt(aMed); + aSml = (float) Math.sqrt(aSml) / sSml; + float yMin = aSml > aMed ? aMed : aSml; + float yMax = aSml > aMed ? aSml : aMed; + scaleVal = 1.0f; + float yMinDevideMax = yMin / yMax; + sumSq = yMax * yMax * (1.0f + yMinDevideMax * yMinDevideMax); + } else { + scaleVal = 1.0f / sSml; + sumSq = aSml; + } + } else { + scaleVal = 1.0f; + sumSq = aMed; + } + return scaleVal * (float) Math.sqrt(sumSq); + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Srot.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Srot.java new file mode 100644 index 0000000000000000000000000000000000000000..240af85925d0537580a5fd1cb472e17144a3fdc2 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Srot.java @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. 
+ * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas1.singleprecision; + +import static com.huawei.vectorblas.utils.ArrayUtil.loopBound; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorSpecies; + +public class Srot { + private static final VectorSpecies SSPECIES = FloatVector.SPECIES_MAX; + + public static void srot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, float c, + float s) { + if (n < 1) { + return; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length); + if (incx == 1 && incy == 1) { + vecSrot(n, x, xOffset, y, yOffset, c, s); + } else { + norSrot(n, x, xOffset, incx, y, yOffset, incy, c, s); + } + } + + private static void norSrot(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, + float c, float s) { + int xIndex = incx < 0 ? 
(-n + 1) * incx : 0; + int yIndex = incy < 0 ? (-n + 1) * incy : 0; + for (int num = n; num > 0; --num) { + float tmp = x[xIndex + xOffset]; + x[xIndex + xOffset] = c * tmp + s * y[yIndex + yOffset]; + y[yIndex + yOffset] = -s * tmp + c * y[yIndex + yOffset]; + xIndex += incx; + yIndex += incy; + } + } + + private static void vecSrot(int n, float[] x, int xOffset, float[] y, int yOffset, float c, float s) { + FloatVector cv = FloatVector.broadcast(SSPECIES, c); + FloatVector sv = FloatVector.broadcast(SSPECIES, s); + FloatVector nsv = FloatVector.broadcast(SSPECIES, -s); + int index = 0; + int idxLoopBound = loopBound(n, SSPECIES.length() * 4); + for (; index < idxLoopBound; index += SSPECIES.length() * 4) { + FloatVector xv0 = FloatVector.fromArray(SSPECIES, x, index + xOffset); + FloatVector xv1 = FloatVector.fromArray(SSPECIES, x, index + SSPECIES.length() + xOffset); + FloatVector xv2 = FloatVector.fromArray(SSPECIES, x, index + SSPECIES.length() * 2 + xOffset); + FloatVector xv3 = FloatVector.fromArray(SSPECIES, x, index + SSPECIES.length() * 3 + xOffset); + + FloatVector yv0 = FloatVector.fromArray(SSPECIES, y, index + yOffset); + FloatVector yv1 = FloatVector.fromArray(SSPECIES, y, index + SSPECIES.length() + yOffset); + FloatVector yv2 = FloatVector.fromArray(SSPECIES, y, index + SSPECIES.length() * 2 + yOffset); + FloatVector yv3 = FloatVector.fromArray(SSPECIES, y, index + SSPECIES.length() * 3 + yOffset); + + xv0.fma(cv, yv0.mul(sv)).intoArray(x, index + xOffset); + xv1.fma(cv, yv1.mul(sv)).intoArray(x, index + SSPECIES.length() + xOffset); + xv2.fma(cv, yv2.mul(sv)).intoArray(x, index + SSPECIES.length() * 2 + xOffset); + xv3.fma(cv, yv3.mul(sv)).intoArray(x, index + SSPECIES.length() * 3 + xOffset); + + xv0.fma(nsv, yv0.mul(cv)).intoArray(y, index + yOffset); + xv1.fma(nsv, yv1.mul(cv)).intoArray(y, index + SSPECIES.length() + yOffset); + xv2.fma(nsv, yv2.mul(cv)).intoArray(y, index + SSPECIES.length() * 2 + yOffset); + xv3.fma(nsv, 
yv3.mul(cv)).intoArray(y, index + SSPECIES.length() * 3 + yOffset); + } + for (; index < SSPECIES.loopBound(n); index += SSPECIES.length()) { + FloatVector xv = FloatVector.fromArray(SSPECIES, x, index + xOffset); + FloatVector yv = FloatVector.fromArray(SSPECIES, y, index + yOffset); + xv.fma(cv, yv.mul(sv)).intoArray(x, index + xOffset); + xv.fma(nsv, yv.mul(cv)).intoArray(y, index + yOffset); + } + for (; index < n; index++) { + float tmp = x[index + xOffset]; + x[index + xOffset] = c * tmp + s * y[index + yOffset]; + y[index + yOffset] = c * y[index + yOffset] - s * tmp; + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Srotm.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Srotm.java new file mode 100644 index 0000000000000000000000000000000000000000..2005a3fd81f8b5befa5a666cb82ee112ee525fee --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Srotm.java @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas1.singleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorSpecies; + +public class Srotm { + private static final VectorSpecies SSPECIES = FloatVector.SPECIES_MAX; + + public static void srotm(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, float[] param, + int paramOffset) { + if (n < 1) { + return; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length); + BlasUtils.checkBlasArray("param", paramOffset, 4, param.length); + if (incx == 1 && incy == 1) { + vecSrotm(n, x, xOffset, y, yOffset, param, paramOffset); + } else { + norSrotm(n, x, xOffset, incx, y, yOffset, incy, param, paramOffset); + } + } + + private static void norSrotm(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy, + float[] param, int paramOffset) { + float flag = param[paramOffset]; + if (Float.compare(flag, -2.0f) == 0) { // If flag equals -2.0, do nothing and return directly. + return; + } + float h11 = param[paramOffset + 1]; + float h12 = 1.0f; + float h21 = -1.0f; + float h22 = param[paramOffset + 4]; + if (Float.compare(flag, -1.0f) == 0) { + h12 = param[paramOffset + 3]; + h21 = param[paramOffset + 2]; + } else if (BlasUtils.isZero(flag)) { + h11 = 1.0f; + h12 = param[paramOffset + 3]; + h21 = param[paramOffset + 2]; + h22 = 1.0f; + } + int xIndex = incx < 0 ? (-n + 1) * incx : 0; + int yIndex = incy < 0 ? 
(-n + 1) * incy : 0; + for (int num = n; num > 0; --num) { + float xTmp = x[xIndex + xOffset]; + x[xIndex + xOffset] = h11 * xTmp + h12 * y[yIndex + yOffset]; + y[yIndex + yOffset] = h21 * xTmp + h22 * y[yIndex + yOffset]; + xIndex += incx; + yIndex += incy; + } + } + + private static void vecSrotm(int n, float[] x, int xOffset, float[] y, int yOffset, float[] param, + int paramOffset) { + float flag = param[paramOffset]; + if (Float.compare(flag, -2.0f) == 0) { // If flag equals -2.0, do nothing and return directly. + return; + } + float h11 = param[paramOffset + 1]; + float h12 = 1.0f; + float h21 = -1.0f; + float h22 = param[paramOffset + 4]; + if (Float.compare(flag, -1.0f) == 0) { + h12 = param[paramOffset + 3]; + h21 = param[paramOffset + 2]; + } else if (BlasUtils.isZero(flag)) { + h11 = 1.0f; + h12 = param[paramOffset + 3]; + h21 = param[paramOffset + 2]; + h22 = 1.0f; + } + FloatVector h11v = FloatVector.broadcast(SSPECIES, h11); + FloatVector h12v = FloatVector.broadcast(SSPECIES, h12); + FloatVector h21v = FloatVector.broadcast(SSPECIES, h21); + FloatVector h22v = FloatVector.broadcast(SSPECIES, h22); + int index = 0; + int idxLoopBound = SSPECIES.loopBound(n); + for (; index < idxLoopBound; index += SSPECIES.length()) { + FloatVector xv = FloatVector.fromArray(SSPECIES, x, index + xOffset); + FloatVector yv = FloatVector.fromArray(SSPECIES, y, index + yOffset); + (xv.mul(h11v)).add(yv.mul(h12v)).intoArray(x, index + xOffset); + (xv.mul(h21v)).add(yv.mul(h22v)).intoArray(y, index + yOffset); + } + for (; index < n; index++) { + float xTmp = x[index + xOffset]; + x[index + xOffset] = h11 * xTmp + h12 * y[index + yOffset]; + y[index + yOffset] = h21 * xTmp + h22 * y[index + yOffset]; + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sscal.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sscal.java new file mode 100644 index 
0000000000000000000000000000000000000000..b4571fc64bb8d6e273dcd0897ac00801fa9e871b --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sscal.java @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.huawei.vectorblas.blas1.singleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorSpecies; + +public class Sscal { + private static final VectorSpecies SSPECIES = FloatVector.SPECIES_MAX; + + public static void sscal(int n, float alpha, float[] x, int xOffset, int incx) { + if (n < 1 || incx < 1 || Double.compare(alpha, 1.0) == 0) { + return; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + if (incx == 1) { + vecSscal(n, alpha, x, xOffset); + } else { + norSscal(n, alpha, x, xOffset, incx); + } + } + + private static void vecSscal(int n, float alpha, float[] x, int xOffset) { + FloatVector alphaVec = FloatVector.broadcast(SSPECIES, alpha); + int index = 0; + int idxLoopBound = SSPECIES.loopBound(n); + for (; index < idxLoopBound; index += SSPECIES.length()) { + FloatVector xv = FloatVector.fromArray(SSPECIES, x, index + xOffset); + xv.mul(alphaVec).intoArray(x, index + xOffset); + } + for (; index < n; index += 1) { + x[index + xOffset] *= alpha; + } + } + + private static void norSscal(int n, float alpha, float[] x, int xOffset, int incx) { + int xIndex = 0; + for (int num = n; num > 0; --num) { + x[xIndex + xOffset] = alpha * x[xIndex + xOffset]; + xIndex += incx; + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sswap.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sswap.java new file mode 100644 index 0000000000000000000000000000000000000000..de9fd92cc7e390e75b393bca8cb6d9044904277b --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas1/singleprecision/Sswap.java @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas1.singleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorSpecies; + +public class Sswap { + private static final VectorSpecies SSPECIES = FloatVector.SPECIES_MAX; + + public static void sswap(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) { + if (n < 1) { + return; + } + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length); + if (incx == 1 && incy == 1) { + vecSswap(n, x, xOffset, y, yOffset); + } else { + norSswap(n, x, xOffset, incx, y, yOffset, incy); + } + } + + private static void vecSswap(int n, float[] x, int xOffset, float[] y, int yOffset) { + int index = 0; + int idxLoopBound = SSPECIES.loopBound(n); + for (; index < idxLoopBound; index += SSPECIES.length()) { + FloatVector xv = FloatVector.fromArray(SSPECIES, x, index + xOffset); + FloatVector yv = FloatVector.fromArray(SSPECIES, y, index + yOffset); + xv.intoArray(y, index + yOffset); + yv.intoArray(x, index + xOffset); + } + for (; index < n; index++) { + 
float tmp = x[index + xOffset]; + x[index + xOffset] = y[index + yOffset]; + y[index + yOffset] = tmp; + } + } + + private static void norSswap(int n, float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) { + int xIndex = incx < 0 ? (-n + 1) * incx + 1 : 1; + int yIndex = incy < 0 ? (-n + 1) * incy + 1 : 1; + for (int num = n; num > 0; --num) { + float tmp = x[xIndex - 1 + xOffset]; + x[xIndex - 1 + xOffset] = y[yIndex - 1 + yOffset]; + y[yIndex - 1 + yOffset] = tmp; + xIndex += incx; + yIndex += incy; + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/DblasLevel2.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/DblasLevel2.java new file mode 100644 index 0000000000000000000000000000000000000000..955e0bf19a38173096ec9f8ea47e7d47df555867 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/DblasLevel2.java @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas2.doubleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorSpecies; + +public class DblasLevel2 { + private static final VectorSpecies DSPECIES = DoubleVector.SPECIES_MAX; + + protected static void dMulBeta(int size, double beta, double[] dy, int yOffset, int incy) { + if (incy == 1) { + DoubleVector betaVec = DoubleVector.broadcast(DSPECIES, beta); + int idx = 0; + int idxLoopBound = DSPECIES.loopBound(size); + for (; idx < idxLoopBound; idx += DSPECIES.length()) { + DoubleVector yv = DoubleVector.fromArray(DSPECIES, dy, idx + yOffset); + betaVec.mul(yv).intoArray(dy, idx + yOffset); + } + for (; idx < size; idx++) { + dy[idx + yOffset] = beta * dy[idx + yOffset]; + } + } else { + int yIndex = incy >= 0 ? 0 : (1 - size) * incy; + if (BlasUtils.isZero(beta)) { + for (int i = 0; i < size; i++, yIndex += incy) { + dy[yIndex + yOffset] = 0.0d; + } + } else { + for (int i = 0; i < size; i++, yIndex += incy) { + dy[yIndex + yOffset] = beta * dy[yIndex + yOffset]; + } + } + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dgemv.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dgemv.java new file mode 100644 index 0000000000000000000000000000000000000000..368105cdf74997898943f3860de33cd9f9c16dee --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dgemv.java @@ -0,0 +1,378 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas2.doubleprecision; + +import static com.huawei.vectorblas.utils.ArrayUtil.loopBound; + +import com.huawei.vectorblas.utils.BlasUtils; +import com.huawei.vectorblas.utils.Lsame; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; + +public class Dgemv { + private static final VectorSpecies DSPECIES = DoubleVector.SPECIES_MAX; + + public static void dgemv(String trans, int m, int n, double alpha, double[] a, int aOffset, int lda, double[] x, + int xOffset, int incx, double beta, double[] y, int yOffset, int incy) { + BlasUtils.checkParameter("DGEMV", 1, Lsame.lsame(trans, "N") || Lsame.lsame(trans, "T")); + BlasUtils.checkParameter("DGEMV", 2, m >= 0); + BlasUtils.checkParameter("DGEMV", 3, n >= 0); + BlasUtils.checkParameter("DGEMV", 6, lda >= Math.max(1, m)); + BlasUtils.checkParameter("DGEMV", 8, incx != 0); + BlasUtils.checkParameter("DGEMV", 11, incy != 0); + if (m == 0 || n == 0 || (BlasUtils.isZero(alpha) && Double.compare(beta, 1.0) == 0)) { + return; + } + boolean transFlag = Lsame.lsame(trans, "N"); + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * ((transFlag ? 
n : m) - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * ((transFlag ? m : n) - 1), y.length); + BlasUtils.checkBlasArray("a", aOffset, (n - 1) * lda + m - 1, a.length); + + if (Double.compare(beta, 1.0) != 0) { + DblasLevel2.dMulBeta(transFlag ? m : n, beta, y, yOffset, incy); + } + if (BlasUtils.isZero(alpha)) { + return; + } + if (transFlag) { + if (incy == 1) { + if (incx == 1) { + vecDgemvN(m, n, alpha, a, aOffset, lda, x, xOffset, y, yOffset); + } else { + vecDgemvN(m, n, alpha, a, aOffset, lda, x, xOffset, incx, y, yOffset); + } + } else { + norDgemvN(m, n, alpha, a, aOffset, lda, x, xOffset, incx, y, yOffset, incy); + } + } else { + if (incx == 1) { + if (incy == 1) { + vecDgemvT(m, n, alpha, a, aOffset, lda, x, xOffset, y, yOffset); + } else { + vecDgemvT(m, n, alpha, a, aOffset, lda, x, xOffset, y, yOffset, incy); + } + } else { + norDgemvT(m, n, alpha, a, aOffset, lda, x, xOffset, incx, y, yOffset, incy); + } + } + } + + private static void vecDgemvN(int m, int n, double alpha, double[] a, int aOffset, int lda, + double[] x, int xOffset, double[] y, int yOffset) { + int col = 0; + int colLoopBound = loopBound(n, 4); + int rowUnrollLoopBound = loopBound(m, DSPECIES.length() * 4); + int rowLoopBound = loopBound(m, DSPECIES.length()); + for (; col < colLoopBound; col += 4) { + DoubleVector xv0 = DoubleVector.broadcast(DSPECIES, alpha * x[col + xOffset]); + DoubleVector xv1 = DoubleVector.broadcast(DSPECIES, alpha * x[col + 1 + xOffset]); + DoubleVector xv2 = DoubleVector.broadcast(DSPECIES, alpha * x[col + 2 + xOffset]); + DoubleVector xv3 = DoubleVector.broadcast(DSPECIES, alpha * x[col + 3 + xOffset]); + int row = 0; + for (; row < rowUnrollLoopBound; row += DSPECIES.length() * 4) { + DoubleVector yv0 = DoubleVector.fromArray(DSPECIES, y, row + yOffset); + DoubleVector yv1 = DoubleVector.fromArray(DSPECIES, y, row + DSPECIES.length() + yOffset); + DoubleVector yv2 = DoubleVector.fromArray(DSPECIES, y, row + DSPECIES.length() * 2 + 
yOffset); + DoubleVector yv3 = DoubleVector.fromArray(DSPECIES, y, row + DSPECIES.length() * 3 + yOffset); + + DoubleVector av00 = DoubleVector.fromArray(DSPECIES, a, row + col * lda + aOffset); + DoubleVector av10 = DoubleVector.fromArray( + DSPECIES, a, row + DSPECIES.length() + col * lda + aOffset); + DoubleVector av20 = DoubleVector.fromArray( + DSPECIES, a, (row + DSPECIES.length() * 2) + col * lda + aOffset); + DoubleVector av30 = DoubleVector.fromArray( + DSPECIES, a, (row + DSPECIES.length() * 3) + col * lda + aOffset); + + DoubleVector av01 = DoubleVector.fromArray(DSPECIES, a, row + (col + 1) * lda + aOffset); + DoubleVector av11 = DoubleVector.fromArray( + DSPECIES, a, row + DSPECIES.length() + (col + 1) * lda + aOffset); + DoubleVector av21 = DoubleVector.fromArray( + DSPECIES, a, (row + DSPECIES.length() * 2) + (col + 1) * lda + aOffset); + DoubleVector av31 = DoubleVector.fromArray( + DSPECIES, a, (row + DSPECIES.length() * 3) + (col + 1) * lda + aOffset); + + DoubleVector av02 = DoubleVector.fromArray(DSPECIES, a, row + (col + 2) * lda + aOffset); + DoubleVector av12 = DoubleVector.fromArray( + DSPECIES, a, row + DSPECIES.length() + (col + 2) * lda + aOffset); + DoubleVector av22 = DoubleVector.fromArray( + DSPECIES, a, (row + DSPECIES.length() * 2) + (col + 2) * lda + aOffset); + DoubleVector av32 = DoubleVector.fromArray( + DSPECIES, a, (row + DSPECIES.length() * 3) + (col + 2) * lda + aOffset); + + DoubleVector av03 = DoubleVector.fromArray(DSPECIES, a, row + (col + 3) * lda + aOffset); + DoubleVector av13 = DoubleVector.fromArray( + DSPECIES, a, row + DSPECIES.length() + (col + 3) * lda + aOffset); + DoubleVector av23 = DoubleVector.fromArray( + DSPECIES, a, (row + DSPECIES.length() * 2) + (col + 3) * lda + aOffset); + DoubleVector av33 = DoubleVector.fromArray( + DSPECIES, a, (row + DSPECIES.length() * 3) + (col + 3) * lda + aOffset); + + av00.fma(xv0, av01.fma(xv1, av02.fma(xv2, av03.fma(xv3, yv0)))).intoArray(y, row + yOffset); + av10.fma(xv0, 
av11.fma(xv1, av12.fma(xv2, av13.fma(xv3, yv1)))) + .intoArray(y, row + DSPECIES.length() + yOffset); + av20.fma(xv0, av21.fma(xv1, av22.fma(xv2, av23.fma(xv3, yv2)))) + .intoArray(y, row + DSPECIES.length() * 2 + yOffset); + av30.fma(xv0, av31.fma(xv1, av32.fma(xv2, av33.fma(xv3, yv3)))) + .intoArray(y, row + DSPECIES.length() * 3 + yOffset); + } + for (; row < rowLoopBound; row += DSPECIES.length()) { + DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, row + yOffset); + + DoubleVector av0 = DoubleVector.fromArray(DSPECIES, a, row + col * lda + aOffset); + DoubleVector av1 = DoubleVector.fromArray(DSPECIES, a, row + (col + 1) * lda + aOffset); + DoubleVector av2 = DoubleVector.fromArray(DSPECIES, a, row + (col + 2) * lda + aOffset); + DoubleVector av3 = DoubleVector.fromArray(DSPECIES, a, row + (col + 3) * lda + aOffset); + + av0.fma(xv0, av1.fma(xv1, av2.fma(xv2, av3.fma(xv3, yv)))).intoArray(y, row + yOffset); + } + double x0 = alpha * x[col + xOffset]; + double x1 = alpha * x[col + 1 + xOffset]; + double x2 = alpha * x[col + 2 + xOffset]; + double x3 = alpha * x[col + 3 + xOffset]; + for (; row < m; row++) { + y[row + yOffset] += x0 * a[row + col * lda + aOffset] + + x1 * a[row + (col + 1) * lda + aOffset] + + x2 * a[row + (col + 2) * lda + aOffset] + + x3 * a[row + (col + 3) * lda + aOffset]; + } + } + for (; col < n; col++) { + if (!BlasUtils.isZero(x[col + xOffset])) { + DoubleVector bv = DoubleVector.broadcast(DSPECIES, alpha * x[col + xOffset]); + int row = 0; + for (; row < rowUnrollLoopBound; row += DSPECIES.length() * 4) { + DoubleVector yv0 = DoubleVector.fromArray(DSPECIES, y, row + yOffset); + DoubleVector yv1 = DoubleVector.fromArray(DSPECIES, y, row + DSPECIES.length() + yOffset); + DoubleVector yv2 = DoubleVector.fromArray(DSPECIES, y, row + DSPECIES.length() * 2 + yOffset); + DoubleVector yv3 = DoubleVector.fromArray(DSPECIES, y, row + DSPECIES.length() * 3 + yOffset); + + DoubleVector av0 = DoubleVector.fromArray(DSPECIES, a, row + col * lda 
+ aOffset); + DoubleVector av1 = DoubleVector.fromArray( + DSPECIES, a, row + DSPECIES.length() + col * lda + aOffset); + DoubleVector av2 = DoubleVector.fromArray( + DSPECIES, a, (row + DSPECIES.length() * 2) + col * lda + aOffset); + DoubleVector av3 = DoubleVector.fromArray( + DSPECIES, a, (row + DSPECIES.length() * 3) + col * lda + aOffset); + + av0.fma(bv, yv0).intoArray(y, row + yOffset); + av1.fma(bv, yv1).intoArray(y, row + DSPECIES.length() + yOffset); + av2.fma(bv, yv2).intoArray(y, row + DSPECIES.length() * 2 + yOffset); + av3.fma(bv, yv3).intoArray(y, row + DSPECIES.length() * 3 + yOffset); + } + for (; row < rowLoopBound; row += DSPECIES.length()) { + DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, row + yOffset); + DoubleVector av = DoubleVector.fromArray(DSPECIES, a, row + col * lda + aOffset); + av.fma(bv, yv).intoArray(y, row + yOffset); + } + double alphaX = alpha * x[col + xOffset]; + for (; row < m; row++) { + y[row + yOffset] += alphaX * a[row + col * lda + aOffset]; + } + } + } + } + + private static void vecDgemvN(int m, int n, double alpha, double[] a, int aOffset, int lda, + double[] x, int xOffset, int incx, double[] y, int yOffset) { + int xIndex = incx > 0 ? 
0 : (n - 1) * (-incx); + for (int col = 0; col < n; col++, xIndex += incx) { + if (!BlasUtils.isZero(x[xIndex + xOffset])) { + double alphaMulX = alpha * x[xIndex + xOffset]; + DoubleVector alphaMulXv = DoubleVector.broadcast(DSPECIES, alphaMulX); + int row = 0; + int rowLoopBound = DSPECIES.loopBound(m); + for (; row < rowLoopBound; row += DSPECIES.length()) { + DoubleVector av = DoubleVector.fromArray(DSPECIES, a, row + col * lda + aOffset); + DoubleVector cv = DoubleVector.fromArray(DSPECIES, y, row + yOffset); + av.fma(alphaMulXv, cv).intoArray(y, row + yOffset); + } + for (; row < m; row++) { + y[row + yOffset] += alphaMulX * a[row + col * lda + aOffset]; + } + } + } + } + + private static void norDgemvN(int m, int n, double alpha, double[] a, int aOffset, int lda, + double[] x, int xOffset, int incx, double[] y, int yOffset, int incy) { + int xIndex = incx > 0 ? 0 : (n - 1) * (-incx); + for (int col = 0; col < n; col++, xIndex += incx) { + if (!BlasUtils.isZero(x[xIndex + xOffset])) { + double alphaMulX = alpha * x[xIndex + xOffset]; + int yIndex = incy > 0 ? 0 : (m - 1) * (-incy); + for (int row = 0; row < m; row++, yIndex += incy) { + y[yIndex + yOffset] += alphaMulX * a[row + col * lda + aOffset]; + } + } + } + } + + private static void vecDgemvT(int m, int n, double alpha, double[] a, int aOffset, int lda, double[] x, + int xOffset, double[] y, int yOffset, int incy) { + int yIndex = incy > 0 ? 
0 : (n - 1) * (-incy); + for (int row = 0; row < n; row++, yIndex += incy) { + DoubleVector cv = DoubleVector.zero(DSPECIES); + int col = 0; + int colLoopBound = DSPECIES.loopBound(m); + for (; col < colLoopBound; col += DSPECIES.length()) { + DoubleVector av = DoubleVector.fromArray(DSPECIES, a, col + row * lda + aOffset); + DoubleVector bv = DoubleVector.fromArray(DSPECIES, x, col + xOffset); + cv = av.fma(bv, cv); + } + double accum = cv.reduceLanes(VectorOperators.ADD); + for (; col < m; col++) { + accum += a[col + row * lda + aOffset] * x[col + xOffset]; + } + y[yIndex + yOffset] += alpha * accum; + } + } + + private static void vecDgemvT(int m, int n, double alpha, double[] a, int aOffset, int lda, + double[] x, int xOffset, double[] y, int yOffset) { + int row = 0; + int rowLoopBound = loopBound(n, 4); + int colUnrollLoopBound = loopBound(m, DSPECIES.length() * 4); + int colLoopBound = loopBound(m, DSPECIES.length()); + for (; row < rowLoopBound; row += 4) { + DoubleVector yv0 = DoubleVector.zero(DSPECIES); + DoubleVector yv1 = DoubleVector.zero(DSPECIES); + DoubleVector yv2 = DoubleVector.zero(DSPECIES); + DoubleVector yv3 = DoubleVector.zero(DSPECIES); + int col = 0; + for (; col < colUnrollLoopBound; col += DSPECIES.length() * 4) { + DoubleVector xv0 = DoubleVector.fromArray(DSPECIES, x, col + xOffset); + DoubleVector xv1 = DoubleVector.fromArray(DSPECIES, x, col + DSPECIES.length() + xOffset); + DoubleVector xv2 = DoubleVector.fromArray(DSPECIES, x, col + (DSPECIES.length() * 2) + xOffset); + DoubleVector xv3 = DoubleVector.fromArray(DSPECIES, x, col + (DSPECIES.length() * 3) + xOffset); + + DoubleVector av00 = DoubleVector.fromArray(DSPECIES, a, col + row * lda + aOffset); + DoubleVector av10 = DoubleVector.fromArray( + DSPECIES, a, col + DSPECIES.length() + row * lda + aOffset); + DoubleVector av20 = DoubleVector.fromArray( + DSPECIES, a, col + (DSPECIES.length() * 2) + row * lda + aOffset); + DoubleVector av30 = DoubleVector.fromArray( + DSPECIES, a, 
col + (DSPECIES.length() * 3) + row * lda + aOffset); + yv0 = av00.fma(xv0, av10.fma(xv1, av20.fma(xv2, av30.fma(xv3, yv0)))); + + DoubleVector av01 = DoubleVector.fromArray(DSPECIES, a, col + (row + 1) * lda + aOffset); + DoubleVector av11 = DoubleVector.fromArray( + DSPECIES, a, col + DSPECIES.length() + (row + 1) * lda + aOffset); + DoubleVector av21 = DoubleVector.fromArray( + DSPECIES, a, col + (DSPECIES.length() * 2) + (row + 1) * lda + aOffset); + DoubleVector av31 = DoubleVector.fromArray( + DSPECIES, a, col + (DSPECIES.length() * 3) + (row + 1) * lda + aOffset); + yv1 = av01.fma(xv0, av11.fma(xv1, av21.fma(xv2, av31.fma(xv3, yv1)))); + + DoubleVector av02 = DoubleVector.fromArray(DSPECIES, a, col + (row + 2) * lda + aOffset); + DoubleVector av12 = DoubleVector.fromArray( + DSPECIES, a, col + DSPECIES.length() + (row + 2) * lda + aOffset); + DoubleVector av22 = DoubleVector.fromArray( + DSPECIES, a, col + (DSPECIES.length() * 2) + (row + 2) * lda + aOffset); + DoubleVector av32 = DoubleVector.fromArray( + DSPECIES, a, col + (DSPECIES.length() * 3) + (row + 2) * lda + aOffset); + yv2 = av02.fma(xv0, av12.fma(xv1, av22.fma(xv2, av32.fma(xv3, yv2)))); + + DoubleVector av03 = DoubleVector.fromArray(DSPECIES, a, col + (row + 3) * lda + aOffset); + DoubleVector av13 = DoubleVector.fromArray( + DSPECIES, a, col + DSPECIES.length() + (row + 3) * lda + aOffset); + DoubleVector av23 = DoubleVector.fromArray( + DSPECIES, a, col + (DSPECIES.length() * 2) + (row + 3) * lda + aOffset); + DoubleVector av33 = DoubleVector.fromArray( + DSPECIES, a, col + (DSPECIES.length() * 3) + (row + 3) * lda + aOffset); + yv3 = av03.fma(xv0, av13.fma(xv1, av23.fma(xv2, av33.fma(xv3, yv3)))); + } + for (; col < colLoopBound; col += DSPECIES.length()) { + DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, col + xOffset); + + DoubleVector av0 = DoubleVector.fromArray(DSPECIES, a, col + row * lda + aOffset); + DoubleVector av1 = DoubleVector.fromArray(DSPECIES, a, col + (row + 1) * lda + 
aOffset); + DoubleVector av2 = DoubleVector.fromArray(DSPECIES, a, col + (row + 2) * lda + aOffset); + DoubleVector av3 = DoubleVector.fromArray(DSPECIES, a, col + (row + 3) * lda + aOffset); + + yv0 = av0.fma(xv, yv0); + yv1 = av1.fma(xv, yv1); + yv2 = av2.fma(xv, yv2); + yv3 = av3.fma(xv, yv3); + } + double accum0 = yv0.reduceLanes(VectorOperators.ADD); + double accum1 = yv1.reduceLanes(VectorOperators.ADD); + double accum2 = yv2.reduceLanes(VectorOperators.ADD); + double accum3 = yv3.reduceLanes(VectorOperators.ADD); + for (; col < m; col++) { + accum0 += a[col + row * lda + aOffset] * x[col + xOffset]; + accum1 += a[col + (row + 1) * lda + aOffset] * x[col + xOffset]; + accum2 += a[col + (row + 2) * lda + aOffset] * x[col + xOffset]; + accum3 += a[col + (row + 3) * lda + aOffset] * x[col + xOffset]; + } + y[row + yOffset] += alpha * accum0; + y[row + 1 + yOffset] += alpha * accum1; + y[row + 2 + yOffset] += alpha * accum2; + y[row + 3 + yOffset] += alpha * accum3; + } + for (; row < n; row++) { + DoubleVector yv = DoubleVector.zero(DSPECIES); + int col = 0; + for (; col < colUnrollLoopBound; col += DSPECIES.length() * 4) { + DoubleVector xv0 = DoubleVector.fromArray(DSPECIES, x, col + xOffset); + DoubleVector xv1 = DoubleVector.fromArray(DSPECIES, x, col + DSPECIES.length() + xOffset); + DoubleVector xv2 = DoubleVector.fromArray(DSPECIES, x, col + (DSPECIES.length() * 2) + xOffset); + DoubleVector xv3 = DoubleVector.fromArray(DSPECIES, x, col + (DSPECIES.length() * 3) + xOffset); + + DoubleVector av0 = DoubleVector.fromArray(DSPECIES, a, col + row * lda + aOffset); + DoubleVector av1 = DoubleVector.fromArray( + DSPECIES, a, col + DSPECIES.length() + row * lda + aOffset); + DoubleVector av2 = DoubleVector.fromArray( + DSPECIES, a, col + (DSPECIES.length() * 2) + row * lda + aOffset); + DoubleVector av3 = DoubleVector.fromArray( + DSPECIES, a, col + (DSPECIES.length() * 3) + row * lda + aOffset); + + yv = av0.fma(xv0, av1.fma(xv1, av2.fma(xv2, av3.fma(xv3, 
yv)))); + } + for (; col < colLoopBound; col += DSPECIES.length()) { + DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, col + xOffset); + DoubleVector av = DoubleVector.fromArray(DSPECIES, a, col + row * lda + aOffset); + yv = av.fma(xv, yv); + } + double accum = yv.reduceLanes(VectorOperators.ADD); + for (; col < m; col++) { + accum += a[col + row * lda + aOffset] * x[col + xOffset]; + } + y[row + yOffset] += alpha * accum; + } + } + + private static void norDgemvT(int m, int n, double alpha, double[] a, int aOffset, int lda, double[] x, + int xOffset, int incx, double[] y, int yOffset, int incy) { + int yIndex = incy > 0 ? 0 : (n - 1) * (-incy); + for (int j = 0; j < n; j++, yIndex += incy) { + double accum = 0.0d; + int xIndex = incx > 0 ? 0 : (m - 1) * (-incx); + for (int i = 0; i < m; i++, xIndex += incx) { + accum += a[i + j * lda + aOffset] * x[xIndex + xOffset]; + } + y[yIndex + yOffset] += alpha * accum; + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dger.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dger.java new file mode 100644 index 0000000000000000000000000000000000000000..b79555108d2bc19af08994290cea4d4273616133 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dger.java @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas2.doubleprecision; + +import static com.huawei.vectorblas.utils.ArrayUtil.loopBound; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorSpecies; + +public class Dger { + private static final VectorSpecies DSPECIES = DoubleVector.SPECIES_MAX; + private static final int UNROLL_SIZE = 4; + + public static void dger(int m, int n, double alpha, double[] x, int xOffset, int incx, double[] y, int yOffset, + int incy, double[] a, int aOffset, int lda) { + BlasUtils.checkParameter("DGER", 1, m >= 0); + BlasUtils.checkParameter("DGER", 2, n >= 0); + BlasUtils.checkParameter("DGER", 5, incx != 0); + BlasUtils.checkParameter("DGER", 7, incy != 0); + BlasUtils.checkParameter("DGER", 9, lda >= Math.max(1, m)); + + if (m == 0 || n == 0 || BlasUtils.isZero(alpha)) { + return; + } + + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (m - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length); + BlasUtils.checkBlasArray("a", aOffset, (m - 1) + (n - 1) * lda, a.length); + + if (incx == 1 && incy == 1) { + vecDger(m, n, alpha, x, xOffset, y, yOffset, a, aOffset, lda); + } else { + normalDger(m, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset, lda); + } + } + + private static void vecDger(int m, int n, double alpha, double[] x, int xOffset, double[] y, int yOffset, + double[] a, int aOffset, int lda) { + int colLoopBound = loopBound(n, UNROLL_SIZE); + int rowLoopBound = loopBound(m, UNROLL_SIZE * 
DSPECIES.length()); + int col = 0; + for (; col < colLoopBound; col += UNROLL_SIZE) { + DoubleVector alphaMulYv0 = DoubleVector.broadcast(DSPECIES, alpha * y[col + yOffset]); + DoubleVector alphaMulYv1 = DoubleVector.broadcast(DSPECIES, alpha * y[col + 1 + yOffset]); + DoubleVector alphaMulYv2 = DoubleVector.broadcast(DSPECIES, alpha * y[col + 2 + yOffset]); + DoubleVector alphaMulYv3 = DoubleVector.broadcast(DSPECIES, alpha * y[col + 3 + yOffset]); + int row = 0; + for (; row < rowLoopBound; row += UNROLL_SIZE * DSPECIES.length()) { + DoubleVector xv0 = DoubleVector.fromArray(DSPECIES, x, row + xOffset); + DoubleVector xv1 = DoubleVector.fromArray(DSPECIES, x, row + DSPECIES.length() + xOffset); + DoubleVector xv2 = DoubleVector.fromArray(DSPECIES, x, row + 2 * DSPECIES.length() + xOffset); + DoubleVector xv3 = DoubleVector.fromArray(DSPECIES, x, row + 3 * DSPECIES.length() + xOffset); + + DoubleVector av00 = DoubleVector.fromArray(DSPECIES, a, row + col * lda + aOffset); + DoubleVector av01 = DoubleVector.fromArray(DSPECIES, a, + row + DSPECIES.length() + col * lda + aOffset); + DoubleVector av02 = DoubleVector.fromArray(DSPECIES, a, + row + 2 * DSPECIES.length() + col * lda + aOffset); + DoubleVector av03 = DoubleVector.fromArray(DSPECIES, a, + row + 3 * DSPECIES.length() + col * lda + aOffset); + + xv0.fma(alphaMulYv0, av00).intoArray(a, row + col * lda + aOffset); + xv1.fma(alphaMulYv0, av01).intoArray(a, row + DSPECIES.length() + col * lda + aOffset); + xv2.fma(alphaMulYv0, av02).intoArray(a, row + 2 * DSPECIES.length() + col * lda + aOffset); + xv3.fma(alphaMulYv0, av03).intoArray(a, row + 3 * DSPECIES.length() + col * lda + aOffset); + + DoubleVector av10 = DoubleVector.fromArray(DSPECIES, a, row + (col + 1) * lda + aOffset); + DoubleVector av11 = DoubleVector.fromArray(DSPECIES, a, + row + DSPECIES.length() + (col + 1) * lda + aOffset); + DoubleVector av12 = DoubleVector.fromArray(DSPECIES, a, + row + 2 * DSPECIES.length() + (col + 1) * lda + aOffset); + 
DoubleVector av13 = DoubleVector.fromArray(DSPECIES, a, + row + 3 * DSPECIES.length() + (col + 1) * lda + aOffset); + + xv0.fma(alphaMulYv1, av10).intoArray(a, row + (col + 1) * lda + aOffset); + xv1.fma(alphaMulYv1, av11).intoArray(a, row + DSPECIES.length() + (col + 1) * lda + aOffset); + xv2.fma(alphaMulYv1, av12).intoArray(a, row + 2 * DSPECIES.length() + (col + 1) * lda + aOffset); + xv3.fma(alphaMulYv1, av13).intoArray(a, row + 3 * DSPECIES.length() + (col + 1) * lda + aOffset); + + DoubleVector av20 = DoubleVector.fromArray(DSPECIES, a, row + (col + 2) * lda + aOffset); + DoubleVector av21 = DoubleVector.fromArray(DSPECIES, a, + row + DSPECIES.length() + (col + 2) * lda + aOffset); + DoubleVector av22 = DoubleVector.fromArray(DSPECIES, a, + row + 2 * DSPECIES.length() + (col + 2) * lda + aOffset); + DoubleVector av23 = DoubleVector.fromArray(DSPECIES, a, + row + 3 * DSPECIES.length() + (col + 2) * lda + aOffset); + + xv0.fma(alphaMulYv2, av20).intoArray(a, row + (col + 2) * lda + aOffset); + xv1.fma(alphaMulYv2, av21).intoArray(a, row + DSPECIES.length() + (col + 2) * lda + aOffset); + xv2.fma(alphaMulYv2, av22).intoArray(a, row + 2 * DSPECIES.length() + (col + 2) * lda + aOffset); + xv3.fma(alphaMulYv2, av23).intoArray(a, row + 3 * DSPECIES.length() + (col + 2) * lda + aOffset); + + DoubleVector av30 = DoubleVector.fromArray(DSPECIES, a, row + (col + 3) * lda + aOffset); + DoubleVector av31 = DoubleVector.fromArray(DSPECIES, a, + row + DSPECIES.length() + (col + 3) * lda + aOffset); + DoubleVector av32 = DoubleVector.fromArray(DSPECIES, a, + row + 2 * DSPECIES.length() + (col + 3) * lda + aOffset); + DoubleVector av33 = DoubleVector.fromArray(DSPECIES, a, + row + 3 * DSPECIES.length() + (col + 3) * lda + aOffset); + + xv0.fma(alphaMulYv3, av30).intoArray(a, row + (col + 3) * lda + aOffset); + xv1.fma(alphaMulYv3, av31).intoArray(a, row + DSPECIES.length() + (col + 3) * lda + aOffset); + xv2.fma(alphaMulYv3, av32).intoArray(a, row + 2 * DSPECIES.length() + 
(col + 3) * lda + aOffset); + xv3.fma(alphaMulYv3, av33).intoArray(a, row + 3 * DSPECIES.length() + (col + 3) * lda + aOffset); + } + double alphaMulY0 = alpha * y[col + yOffset]; + double alphaMulY1 = alpha * y[col + 1 + yOffset]; + double alphaMulY2 = alpha * y[col + 2 + yOffset]; + double alphaMulY3 = alpha * y[col + 3 + yOffset]; + for (; row < m; row++) { + a[row + col * lda + aOffset] += alphaMulY0 * x[row + xOffset]; + a[row + (col + 1) * lda + aOffset] += alphaMulY1 * x[row + xOffset]; + a[row + (col + 2) * lda + aOffset] += alphaMulY2 * x[row + xOffset]; + a[row + (col + 3) * lda + aOffset] += alphaMulY3 * x[row + xOffset]; + } + } + for (; col < n; col++) { + DoubleVector alphaMulYv = DoubleVector.broadcast(DSPECIES, alpha * y[col + yOffset]); + int row = 0; + for (; row < rowLoopBound; row += UNROLL_SIZE * DSPECIES.length()) { + DoubleVector av0 = DoubleVector.fromArray(DSPECIES, a, row + col * lda + aOffset); + DoubleVector av1 = DoubleVector.fromArray(DSPECIES, a, row + DSPECIES.length() + col * lda + aOffset); + DoubleVector av2 = DoubleVector.fromArray(DSPECIES, a, + row + 2 * DSPECIES.length() + col * lda + aOffset); + DoubleVector av3 = DoubleVector.fromArray(DSPECIES, a, + row + 3 * DSPECIES.length() + col * lda + aOffset); + + DoubleVector xv0 = DoubleVector.fromArray(DSPECIES, x, row + xOffset); + DoubleVector xv1 = DoubleVector.fromArray(DSPECIES, x, row + DSPECIES.length() + xOffset); + DoubleVector xv2 = DoubleVector.fromArray(DSPECIES, x, row + 2 * DSPECIES.length() + xOffset); + DoubleVector xv3 = DoubleVector.fromArray(DSPECIES, x, row + 3 * DSPECIES.length() + xOffset); + + xv0.fma(alphaMulYv, av0).intoArray(a, row + col * lda + aOffset); + xv1.fma(alphaMulYv, av1).intoArray(a, row + DSPECIES.length() + col * lda + aOffset); + xv2.fma(alphaMulYv, av2).intoArray(a, row + 2 * DSPECIES.length() + col * lda + aOffset); + xv3.fma(alphaMulYv, av3).intoArray(a, row + 3 * DSPECIES.length() + col * lda + aOffset); + } + double alphaMulY0 = alpha * 
y[col + yOffset]; + for (; row < m; row++) { + a[row + col * lda + aOffset] += alphaMulY0 * x[row + xOffset]; + } + } + } + + private static void normalDger(int m, int n, double alpha, double[] x, int xOffset, int incx, double[] y, + int yOffset, int incy, double[] a, int aOffset, int lda) { + int xStartIndx = incx > 0 ? 0 : -(m - 1) * incx; + int yStartIndx = incy > 0 ? 0 : -(n - 1) * incy; + + for (int j = 0; j < n; j++, yStartIndx += incy) { + if (!BlasUtils.isZero(y[yStartIndx + yOffset])) { + for (int i = 0, xIndx = xStartIndx; i < m; i++, xIndx += incx) { + a[i + j * lda + aOffset] += alpha * x[xIndx + xOffset] * y[yStartIndx + yOffset]; + } + } + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dspmv.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dspmv.java new file mode 100644 index 0000000000000000000000000000000000000000..b709571cfe4552e7af45ef9b151b4ee9d6bb7540 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dspmv.java @@ -0,0 +1,288 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
/**
 * DSPMV: symmetric packed matrix-vector product, {@code y := alpha * A * x + beta * y}, where only one
 * triangle of the n-by-n matrix A is stored, column after column, in a one-dimensional array.
 *
 * <p>Index formulas used by the kernels below (0-based row/col):
 * upper triangle, row <= col: {@code aOffset + row + col * (col + 1) / 2};
 * lower triangle, row >= col: {@code aOffset + row - col * (col + 1) / 2 + n * col}.
 */
public class Dspmv {
    // NOTE(review): declared with the raw VectorSpecies type and public visibility; sibling kernels keep
    // their species constant private. Presumably VectorSpecies<Double> was intended -- confirm.
    public static final VectorSpecies DSPECIES = DoubleVector.SPECIES_MAX;

    /**
     * Entry point: validates the arguments, applies the beta scaling to y and dispatches to the
     * upper/lower and vectorized/strided kernel.
     *
     * @param uplo "U" when the upper triangle is stored, "L" for the lower triangle (compared via Lsame)
     * @param n order of the matrix; must be {@code >= 0}
     * @param alpha scalar applied to {@code A * x}
     * @param a packed triangle of A
     * @param aOffset index of the first packed element inside a
     * @param x input vector of n elements
     * @param xOffset index of the first addressed element of x
     * @param incx stride of x; must be non-zero
     * @param beta scalar applied to y before accumulation
     * @param y input/output vector of n elements, updated in place
     * @param yOffset index of the first addressed element of y
     * @param incy stride of y; must be non-zero
     */
    public static void dspmv(String uplo, int n, double alpha, double[] a, int aOffset, double[] x, int xOffset,
        int incx, double beta, double[] y, int yOffset, int incy) {
        // The numeric argument is the 1-based position of the offending parameter in the BLAS signature;
        // presumably checkParameter raises when the condition is false (see BlasUtils).
        BlasUtils.checkParameter("DSPMV", 1, Lsame.lsame(uplo, "U") || Lsame.lsame(uplo, "L"));
        BlasUtils.checkParameter("DSPMV", 2, n >= 0);
        BlasUtils.checkParameter("DSPMV", 6, incx != 0);
        BlasUtils.checkParameter("DSPMV", 9, incy != 0);

        // Nothing to do: empty problem, or alpha == 0 with beta == 1 leaves y unchanged.
        if (n == 0 || (BlasUtils.isZero(alpha) && Double.compare(beta, 1.0) == 0)) {
            return;
        }

        BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length);
        BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length);
        // Last packed index is n * (n + 1) / 2 - 1.
        // NOTE(review): (1 + n) * n is int arithmetic and overflows for n >= 46341; the packed index
        // expressions in the vectorized kernels have the same limit.
        BlasUtils.checkBlasArray("a", aOffset, (1 + n) * n / 2 - 1, a.length);

        boolean uploFlag = Lsame.lsame(uplo, "U");
        // With a negative stride the logical first element sits at the far end of the array.
        int xStartIndex = incx > 0 ? 0 : (n - 1) * (-incx);
        int yStartIndex = incy > 0 ? 0 : (n - 1) * (-incy);
        if (Double.compare(beta, 1.0d) != 0) {
            // Presumably scales the n strided elements of y by beta (helper defined elsewhere).
            DblasLevel2.dMulBeta(n, beta, y, yOffset, incy);
        }
        if (BlasUtils.isZero(alpha)) {
            return;
        }
        // Unit strides on both vectors select the vectorized kernels; anything else goes scalar.
        if (uploFlag) {
            if (incx == 1 && incy == 1) {
                vecDspmvU(n, alpha, a, aOffset, x, xOffset, y, yOffset);
            } else {
                norDspmvU(n, alpha, a, aOffset, x, xOffset, incx, y, yOffset, incy, xStartIndex, yStartIndex);
            }
        } else {
            if (incx == 1 && incy == 1) {
                vecDspmvL(n, alpha, a, aOffset, x, xOffset, y, yOffset);
            } else {
                norDspmvL(n, alpha, a, aOffset, x, xOffset, incx, y, yOffset, incy, xStartIndex, yStartIndex);
            }
        }
    }

    /**
     * Vectorized kernel for the upper packed triangle with unit strides; accumulates
     * {@code alpha * A * x} into y.
     *
     * <p>Each stored element a(row, col) with row < col is used twice: it adds
     * {@code alpha * x[col] * a} to y[row] and {@code alpha * x[row] * a} to y[col] (the latter is
     * gathered in the accum variables). Columns are handled four at a time; the 4x4 block on the
     * diagonal is handled with scalars after the strictly-above-diagonal rows.
     */
    private static void vecDspmvU(int n, double alpha, double[] a, int aOffset, double[] x, int xOffset, double[] y,
        int yOffset) {
        int col = 0;
        int colLoopBound = loopBound(n, 4);
        for (; col < colLoopBound; col += 4) { // four columns (col .. col + 3) are processed per iteration
            double alphaMulX0 = alpha * x[xOffset + col];
            double alphaMulX1 = alpha * x[xOffset + (col + 1)];
            double alphaMulX2 = alpha * x[xOffset + (col + 2)];
            double alphaMulX3 = alpha * x[xOffset + (col + 3)];
            DoubleVector alphaMulXV0 = DoubleVector.broadcast(DSPECIES, alphaMulX0);
            DoubleVector alphaMulXV1 = DoubleVector.broadcast(DSPECIES, alphaMulX1);
            DoubleVector alphaMulXV2 = DoubleVector.broadcast(DSPECIES, alphaMulX2);
            DoubleVector alphaMulXV3 = DoubleVector.broadcast(DSPECIES, alphaMulX3);
            // Per-column partial sums of a(row, col + k) * x[row] over the rows above the block.
            DoubleVector accumv0 = DoubleVector.zero(DSPECIES);
            DoubleVector accumv1 = DoubleVector.zero(DSPECIES);
            DoubleVector accumv2 = DoubleVector.zero(DSPECIES);
            DoubleVector accumv3 = DoubleVector.zero(DSPECIES);
            int row = 0;
            // Whole vectors of rows strictly above row `col`.
            for (; row < col - col % DSPECIES.length(); row += DSPECIES.length()) {
                DoubleVector av0 = DoubleVector.fromArray(DSPECIES, a, aOffset + row + col * (col + 1) / 2);
                DoubleVector av1 = DoubleVector.fromArray(DSPECIES, a, aOffset + row + (col + 1) * ((col + 1) + 1) / 2);
                DoubleVector av2 = DoubleVector.fromArray(DSPECIES, a, aOffset + row + (col + 2) * ((col + 2) + 1) / 2);
                DoubleVector av3 = DoubleVector.fromArray(DSPECIES, a, aOffset + row + (col + 3) * ((col + 3) + 1) / 2);
                DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, yOffset + row);
                DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, xOffset + row);
                // y[row..] += sum_k (alpha * x[col + k]) * a(row.., col + k)
                yv = alphaMulXV0.fma(av0, yv);
                yv = alphaMulXV1.fma(av1, yv);
                yv = alphaMulXV2.fma(av2, yv);
                alphaMulXV3.fma(av3, yv).intoArray(y, yOffset + row);
                // Transposed contribution, reduced and scaled by alpha after the loop.
                accumv0 = xv.fma(av0, accumv0);
                accumv1 = xv.fma(av1, accumv1);
                accumv2 = xv.fma(av2, accumv2);
                accumv3 = xv.fma(av3, accumv3);
            }
            double accum0 = alpha * accumv0.reduceLanes(VectorOperators.ADD);
            double accum1 = alpha * accumv1.reduceLanes(VectorOperators.ADD);
            double accum2 = alpha * accumv2.reduceLanes(VectorOperators.ADD);
            double accum3 = alpha * accumv3.reduceLanes(VectorOperators.ADD);
            // Scalar tail for the remaining rows above the diagonal block.
            for (; row < col; row++) {
                double a0 = a[aOffset + row + col * (col + 1) / 2];
                double a1 = a[aOffset + row + (col + 1) * ((col + 1) + 1) / 2];
                double a2 = a[aOffset + row + (col + 2) * ((col + 2) + 1) / 2];
                double a3 = a[aOffset + row + (col + 3) * ((col + 3) + 1) / 2];
                double x0 = x[row + xOffset];
                y[row + yOffset] += alpha * (a0 * x[col + xOffset] + a1 * x[(col + 1) + xOffset]
                    + a2 * x[(col + 2) + xOffset] + a3 * x[(col + 3) + xOffset]);
                accum0 += alpha * a0 * x0;
                accum1 += alpha * a1 * x0;
                accum2 += alpha * a2 * x0;
                accum3 += alpha * a3 * x0;
            }
            // Here row == col: load the upper triangle of the 4x4 diagonal block; aij = a(col + i, col + j).
            double a00 = a[aOffset + row + col * (col + 1) / 2];
            double a01 = a[aOffset + row + (col + 1) * ((col + 1) + 1) / 2];
            double a02 = a[aOffset + row + (col + 2) * ((col + 2) + 1) / 2];
            double a03 = a[aOffset + row + (col + 3) * ((col + 3) + 1) / 2];
            double a11 = a[aOffset + (row + 1) + (col + 1) * ((col + 1) + 1) / 2];
            double a12 = a[aOffset + (row + 1) + (col + 2) * ((col + 2) + 1) / 2];
            double a13 = a[aOffset + (row + 1) + (col + 3) * ((col + 3) + 1) / 2];
            double a22 = a[aOffset + (row + 2) + (col + 2) * ((col + 2) + 1) / 2];
            double a23 = a[aOffset + (row + 2) + (col + 3) * ((col + 3) + 1) / 2];
            double a33 = a[aOffset + (row + 3) + (col + 3) * ((col + 3) + 1) / 2];
            // Symmetric 4x4 block times alpha * x[col .. col + 3], plus the contribution from rows above.
            y[yOffset + col] += alphaMulX0 * a00 + alphaMulX1 * a01 + alphaMulX2 * a02 + alphaMulX3 * a03 + accum0;
            y[yOffset + (col + 1)] += alphaMulX0 * a01 + alphaMulX1 * a11 + alphaMulX2 * a12 + alphaMulX3 * a13
                + accum1;
            y[yOffset + (col + 2)] += alphaMulX0 * a02 + alphaMulX1 * a12 + alphaMulX2 * a22 + alphaMulX3 * a23
                + accum2;
            y[yOffset + (col + 3)] += alphaMulX0 * a03 + alphaMulX1 * a13 + alphaMulX2 * a23 + alphaMulX3 * a33
                + accum3;
        }
        // Leftover columns (fewer than four), one at a time.
        for (; col < n; col += 1) {
            double alphaMulX0 = alpha * x[xOffset + col];
            DoubleVector accumv0 = DoubleVector.zero(DSPECIES);
            DoubleVector alphaMulXV0 = DoubleVector.broadcast(DSPECIES, alphaMulX0);
            int row = 0;
            for (; row < col - col % DSPECIES.length(); row += DSPECIES.length()) {
                DoubleVector av = DoubleVector.fromArray(DSPECIES, a, aOffset + row + col * (col + 1) / 2);
                DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, yOffset + row);
                DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, xOffset + row);
                av.fma(alphaMulXV0, yv).intoArray(y, yOffset + row);
                accumv0 = av.fma(xv, accumv0);
            }
            // Unlike the unrolled loop, alpha is applied to this sum only once, at the final update.
            double accum0 = accumv0.reduceLanes(VectorOperators.ADD);
            for (; row < col; row++) {
                double a0 = a[aOffset + row + col * (col + 1) / 2];
                y[yOffset + row] += a0 * alphaMulX0;
                accum0 += x[xOffset + row] * a0;
            }
            // row == col: diagonal element plus the accumulated transposed contribution.
            y[yOffset + col] += a[aOffset + row + col * (col + 1) / 2] * alphaMulX0 + alpha * accum0;
        }
    }

    /**
     * Scalar kernel for the upper packed triangle with arbitrary non-zero strides.
     * {@code aIndx} is the 1-based packed position of the first element of the current column;
     * column {@code col} holds {@code col + 1} elements, the last one being the diagonal.
     */
    private static void norDspmvU(int n, double alpha, double[] a, int aOffset, double[] x, int xOffset, int incx,
        double[] y, int yOffset, int incy, int xStartIndex, int yStartIndex) {
        int aIndx = 1;
        for (int col = 0, xIndx = xStartIndex, yIndx = yStartIndex; col < n; col++, xIndx += incx, yIndx += incy) {
            double alphaMulX = alpha * x[xIndx + xOffset];
            double accum = 0.0d;

            // Off-diagonal elements of this column; xi/yi walk x and y from their logical start.
            for (int row = aIndx, xi = xStartIndex, yi = yStartIndex; row < aIndx + col; row++, xi += incx,
                yi += incy) {
                y[yi + yOffset] += alphaMulX * a[row - 1 + aOffset];
                accum += a[row - 1 + aOffset] * x[xi + xOffset];
            }

            // Diagonal element plus the transposed contribution gathered above.
            y[yIndx + yOffset] = y[yIndx + yOffset] + alphaMulX * a[aIndx + col - 1 + aOffset] + alpha * accum;
            aIndx += col + 1;
        }
    }

    /**
     * Vectorized kernel for the lower packed triangle with unit strides; accumulates
     * {@code alpha * A * x} into y.
     *
     * <p>Columns are handled four at a time: the 4x4 block on the diagonal is processed first with
     * scalars, then the rows below it with vectors and a scalar tail. Each element below the block
     * contributes both to y[row] and, through the accum variables, to y[col + k].
     */
    private static void vecDspmvL(int n, double alpha, double[] a, int aOffset, double[] x, int xOffset, double[] y,
        int yOffset) {
        int col = 0;
        int colLoopBound = loopBound(n, 4);
        for (; col < colLoopBound; col += 4) { // four columns (col .. col + 3) are processed per iteration
            int row = col;
            double alphaMulX0 = alpha * x[xOffset + col];
            double alphaMulX1 = alpha * x[xOffset + (col + 1)];
            double alphaMulX2 = alpha * x[xOffset + (col + 2)];
            double alphaMulX3 = alpha * x[xOffset + (col + 3)];
            DoubleVector alphaMulXV0 = DoubleVector.broadcast(DSPECIES, alphaMulX0);
            DoubleVector alphaMulXV1 = DoubleVector.broadcast(DSPECIES, alphaMulX1);
            DoubleVector alphaMulXV2 = DoubleVector.broadcast(DSPECIES, alphaMulX2);
            DoubleVector alphaMulXV3 = DoubleVector.broadcast(DSPECIES, alphaMulX3);
            // Lower triangle of the 4x4 diagonal block; aij = a(col + i, col + j).
            double a00 = a[aOffset + row - col * (col + 1) / 2 + n * col];
            double a10 = a[aOffset + (row + 1) - col * (col + 1) / 2 + n * col];
            double a20 = a[aOffset + (row + 2) - col * (col + 1) / 2 + n * col];
            double a30 = a[aOffset + (row + 3) - col * (col + 1) / 2 + n * col];
            double a11 = a[aOffset + (row + 1) - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1)];
            double a21 = a[aOffset + (row + 2) - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1)];
            double a31 = a[aOffset + (row + 3) - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1)];
            double a22 = a[aOffset + (row + 2) - (col + 2) * ((col + 2) + 1) / 2 + n * (col + 2)];
            double a32 = a[aOffset + (row + 3) - (col + 2) * ((col + 2) + 1) / 2 + n * (col + 2)];
            double a33 = a[aOffset + (row + 3) - (col + 3) * ((col + 3) + 1) / 2 + n * (col + 3)];
            // Symmetric 4x4 block times alpha * x[col .. col + 3]; later extended by the rows below.
            double accum0 = alphaMulX0 * a00 + alphaMulX1 * a10 + alphaMulX2 * a20 + alphaMulX3 * a30;
            double accum1 = alphaMulX0 * a10 + alphaMulX1 * a11 + alphaMulX2 * a21 + alphaMulX3 * a31;
            double accum2 = alphaMulX0 * a20 + alphaMulX1 * a21 + alphaMulX2 * a22 + alphaMulX3 * a32;
            double accum3 = alphaMulX0 * a30 + alphaMulX1 * a31 + alphaMulX2 * a32 + alphaMulX3 * a33;
            DoubleVector accumv0 = DoubleVector.zero(DSPECIES);
            DoubleVector accumv1 = DoubleVector.zero(DSPECIES);
            DoubleVector accumv2 = DoubleVector.zero(DSPECIES);
            DoubleVector accumv3 = DoubleVector.zero(DSPECIES);
            // Skip past the diagonal block; the remaining rows lie below it in all four columns.
            row += 4;
            for (; row <= (n - n % DSPECIES.length() - DSPECIES.length()); row += DSPECIES.length()) {
                DoubleVector av0 = DoubleVector.fromArray(DSPECIES, a, aOffset + row - col * (col + 1) / 2 + n * col);
                DoubleVector av1 = DoubleVector.fromArray(DSPECIES, a,
                    aOffset + row - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1));
                DoubleVector av2 = DoubleVector.fromArray(DSPECIES, a,
                    aOffset + row - (col + 2) * ((col + 2) + 1) / 2 + n * (col + 2));
                DoubleVector av3 = DoubleVector.fromArray(DSPECIES, a,
                    aOffset + row - (col + 3) * ((col + 3) + 1) / 2 + n * (col + 3));
                DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, yOffset + row);
                DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, xOffset + row);
                // y[row..] += sum_k (alpha * x[col + k]) * a(row.., col + k)
                yv = alphaMulXV0.fma(av0, yv);
                yv = alphaMulXV1.fma(av1, yv);
                yv = alphaMulXV2.fma(av2, yv);
                alphaMulXV3.fma(av3, yv).intoArray(y, yOffset + row);
                // Transposed contribution for y[col + k], reduced and scaled by alpha after the loop.
                accumv0 = xv.fma(av0, accumv0);
                accumv1 = xv.fma(av1, accumv1);
                accumv2 = xv.fma(av2, accumv2);
                accumv3 = xv.fma(av3, accumv3);
            }
            accum0 += alpha * accumv0.reduceLanes(VectorOperators.ADD);
            accum1 += alpha * accumv1.reduceLanes(VectorOperators.ADD);
            accum2 += alpha * accumv2.reduceLanes(VectorOperators.ADD);
            accum3 += alpha * accumv3.reduceLanes(VectorOperators.ADD);
            // Scalar tail for the rows not covered by the vector loop.
            for (; row < n; row += 1) {
                double a0 = a[aOffset + row - col * (col + 1) / 2 + n * col];
                double a1 = a[aOffset + row - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1)];
                double a2 = a[aOffset + row - (col + 2) * ((col + 2) + 1) / 2 + n * (col + 2)];
                double a3 = a[aOffset + row - (col + 3) * ((col + 3) + 1) / 2 + n * (col + 3)];
                y[yOffset + row] += alphaMulX0 * a0 + alphaMulX1 * a1 + alphaMulX2 * a2 + alphaMulX3 * a3;
                accum0 += alpha * x[xOffset + row] * a0;
                accum1 += alpha * x[xOffset + row] * a1;
                accum2 += alpha * x[xOffset + row] * a2;
                accum3 += alpha * x[xOffset + row] * a3;
            }
            y[yOffset + col] += accum0;
            y[yOffset + (col + 1)] += accum1;
            y[yOffset + (col + 2)] += accum2;
            y[yOffset + (col + 3)] += accum3;
        }
        // Leftover columns (fewer than four), handled with scalars only.
        for (; col < n; col += 1) {
            double alphaMulX0 = alpha * x[xOffset + col];
            // Diagonal element first.
            y[yOffset + col] += a[aOffset + col - col * (col + 1) / 2 + n * col] * alphaMulX0;
            int row = col + 1;
            double accum0 = 0.0d;
            for (; row < n; row++) {
                double a0 = a[aOffset + row - col * (col + 1) / 2 + n * col];
                y[yOffset + row] += a0 * alphaMulX0;
                accum0 += x[xOffset + row] * a0;
            }
            y[yOffset + col] += alpha * accum0;
        }
    }

    /**
     * Scalar kernel for the lower packed triangle with arbitrary non-zero strides.
     * {@code aIndx} is the 1-based packed position of the diagonal element of the current column;
     * column {@code col} holds {@code n - col} elements, starting with the diagonal.
     */
    private static void norDspmvL(int n, double alpha, double[] a, int aOffset, double[] x, int xOffset, int incx,
        double[] y, int yOffset, int incy, int xStartIndex, int yStartIndex) {
        int aIndx = 1;
        for (int col = 0, xIndx = xStartIndex, yIndx = yStartIndex; col < n; col++, xIndx += incx, yIndx += incy) {
            double alphaMulX = alpha * x[xIndx + xOffset];
            double accum = 0.0d;
            // Diagonal contribution.
            y[yIndx + yOffset] += alphaMulX * a[aIndx - 1 + aOffset];

            // Elements below the diagonal; xi/yi start one stride past the current column's position.
            for (int row = aIndx + 1, xi = xIndx + incx, yi = yIndx + incy; row < aIndx + n - col; row++, xi += incx,
                yi += incy) {
                y[yi + yOffset] += alphaMulX * a[row - 1 + aOffset];
                accum += a[row - 1 + aOffset] * x[xi + xOffset];
            }
            y[yIndx + yOffset] += alpha * accum;
            aIndx += n - col;
        }
    }
}
b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dspr.java @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas2.doubleprecision; + +import com.huawei.vectorblas.blas1.doubleprecision.Daxpy; +import com.huawei.vectorblas.utils.BlasUtils; +import com.huawei.vectorblas.utils.Lsame; + +public class Dspr { + public static void dspr(String uplo, int n, double alpha, double[] x, int xOffset, int incx, double[] ap, + int aOffset) { + BlasUtils.checkParameter("DSPR", 1, Lsame.lsame(uplo, "U") || Lsame.lsame(uplo, "L")); + BlasUtils.checkParameter("DSPR", 2, n >= 0); + BlasUtils.checkParameter("DSPR", 5, incx != 0); + + if (n == 0 || BlasUtils.isZero(alpha)) { + return; + } + + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + BlasUtils.checkBlasArray("a", aOffset, (1 + n) * n / 2 - 1, ap.length); + + boolean uploFlag = Lsame.lsame(uplo, "U"); + int xStartIndx = incx >= 0 ? 
0 : (1 - n) * incx; + + int cnt = 0; + if (incx >= 0) { + for (int j = 0, xIndx = xStartIndx; j < n; j++, xIndx += incx) { + int colCnt = uploFlag ? j + 1 : n - j; + if (!BlasUtils.isZero(x[xIndx + xOffset])) { + int kIndx = uploFlag ? 0 : xIndx; + Daxpy.daxpy(colCnt, alpha * x[xIndx + xOffset], x, xOffset + kIndx, incx, ap, aOffset + cnt, 1); + } + cnt += colCnt; + } + } else { + for (int j = 0, xIndx = xStartIndx; j < n; j++, xIndx += incx) { + int colCnt = uploFlag ? j + 1 : n - j; + if (!BlasUtils.isZero(x[xIndx + xOffset])) { + int kIndx = uploFlag ? xIndx : 0; + Daxpy.daxpy(colCnt, alpha * x[xIndx + xOffset], x, xOffset + kIndx, incx, ap, aOffset + cnt, 1); + } + cnt += colCnt; + } + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dsymv.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dsymv.java new file mode 100644 index 0000000000000000000000000000000000000000..d13c55e7dcba2ab451ecc6aeb0005184a2032de8 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/doubleprecision/Dsymv.java @@ -0,0 +1,279 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas2.doubleprecision; + +import static com.huawei.vectorblas.utils.ArrayUtil.loopBound; + +import com.huawei.vectorblas.utils.BlasUtils; +import com.huawei.vectorblas.utils.Lsame; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; + +public class Dsymv { + private static final VectorSpecies DSPECIES = DoubleVector.SPECIES_MAX; + + public static void dsymv(String uplo, int n, double alpha, double[] a, int aOffset, int lda, double[] x, + int xOffset, int incx, double beta, double[] y, int yOffset, int incy) { + BlasUtils.checkParameter("DSYMV", 1, Lsame.lsame(uplo, "U") || Lsame.lsame(uplo, "L")); + BlasUtils.checkParameter("DSYMV", 2, n >= 0); + BlasUtils.checkParameter("DSYMV", 5, lda >= Math.max(1, n)); + BlasUtils.checkParameter("DSYMV", 7, incx != 0); + BlasUtils.checkParameter("DSYMV", 10, incy != 0); + + if (n == 0 || (BlasUtils.isZero(alpha) && Double.compare(beta, 1.0) == 0)) { + return; + } + + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length); + BlasUtils.checkBlasArray("a", aOffset, (n - 1) + (n - 1) * lda, a.length); + + boolean uploFlag = Lsame.lsame(uplo, "U"); + int xStartIndex = incx > 0 ? 0 : (n - 1) * (-incx); + int yStartIndex = incy > 0 ? 
0 : (n - 1) * (-incy); + if (Double.compare(beta, 1.0d) != 0) { + DblasLevel2.dMulBeta(n, beta, y, yOffset, incy); + } + if (BlasUtils.isZero(alpha)) { + return; + } + if (uploFlag) { + if (incx == 1 && incy == 1) { + vecDsymvU(n, x, xOffset, alpha, y, yOffset, a, aOffset, lda); + } else { + norDsymvU(n, x, xOffset, incx, alpha, y, yOffset, incy, a, aOffset, lda, xStartIndex, yStartIndex); + } + } else if (incx == 1 && incy == 1) { + vecDsymvL(n, x, xOffset, alpha, y, yOffset, a, aOffset, lda); + } else { + norDsymvL(n, x, xOffset, incx, alpha, y, yOffset, incy, a, aOffset, lda, xStartIndex, yStartIndex); + } + } + + private static void vecDsymvU(int n, double[] x, int xOffset, double alpha, double[] y, int yOffset, double[] a, + int aOffset, int lda) { + int col = 0; + int colLoopBound = loopBound(n, 4); + for (; col < colLoopBound; col += 4) { // 4 is unroll size for column + double alphaMulX0 = alpha * x[col + xOffset]; + double alphaMulX1 = alpha * x[(col + 1) + xOffset]; + double alphaMulX2 = alpha * x[(col + 2) + xOffset]; + double alphaMulX3 = alpha * x[(col + 3) + xOffset]; + DoubleVector alphaXv0 = DoubleVector.broadcast(DSPECIES, alphaMulX0); + DoubleVector alphaXv1 = DoubleVector.broadcast(DSPECIES, alphaMulX1); + DoubleVector alphaXv2 = DoubleVector.broadcast(DSPECIES, alphaMulX2); + DoubleVector alphaXv3 = DoubleVector.broadcast(DSPECIES, alphaMulX3); + DoubleVector accumv0 = DoubleVector.zero(DSPECIES); + DoubleVector accumv1 = DoubleVector.zero(DSPECIES); + DoubleVector accumv2 = DoubleVector.zero(DSPECIES); + DoubleVector accumv3 = DoubleVector.zero(DSPECIES); + int row = 0; + for (; row < col - col % DSPECIES.length(); row += DSPECIES.length()) { + DoubleVector av0 = DoubleVector.fromArray(DSPECIES, a, row + col * lda + aOffset); + DoubleVector av1 = DoubleVector.fromArray(DSPECIES, a, row + (col + 1) * lda + aOffset); + DoubleVector av2 = DoubleVector.fromArray(DSPECIES, a, row + (col + 2) * lda + aOffset); + DoubleVector av3 = 
DoubleVector.fromArray(DSPECIES, a, row + (col + 3) * lda + aOffset); + DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, row + yOffset); + DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, row + xOffset); + yv = av0.fma(alphaXv0, yv); + yv = av1.fma(alphaXv1, yv); + yv = av2.fma(alphaXv2, yv); + av3.fma(alphaXv3, yv).intoArray(y, row + yOffset); + accumv0 = av0.fma(xv, accumv0); + accumv1 = av1.fma(xv, accumv1); + accumv2 = av2.fma(xv, accumv2); + accumv3 = av3.fma(xv, accumv3); + } + double accum0 = alpha * accumv0.reduceLanes(VectorOperators.ADD); + double accum1 = alpha * accumv1.reduceLanes(VectorOperators.ADD); + double accum2 = alpha * accumv2.reduceLanes(VectorOperators.ADD); + double accum3 = alpha * accumv3.reduceLanes(VectorOperators.ADD); + for (; row < col; row++) { + double a0 = a[row + col * lda + aOffset]; + double a1 = a[row + (col + 1) * lda + aOffset]; + double a2 = a[row + (col + 2) * lda + aOffset]; + double a3 = a[row + (col + 3) * lda + aOffset]; + double x0 = x[row + xOffset]; + y[row + yOffset] += alpha * (a0 * x[col + xOffset] + a1 * x[(col + 1) + xOffset] + + a2 * x[(col + 2) + xOffset] + a3 * x[(col + 3) + xOffset]); + accum0 += alpha * a0 * x0; + accum1 += alpha * a1 * x0; + accum2 += alpha * a2 * x0; + accum3 += alpha * a3 * x0; + } + double a00 = a[row + col * lda + aOffset]; + double a01 = a[row + (col + 1) * lda + aOffset]; + double a02 = a[row + (col + 2) * lda + aOffset]; + double a03 = a[row + (col + 3) * lda + aOffset]; + double a11 = a[(row + 1) + (col + 1) * lda + aOffset]; + double a12 = a[(row + 1) + (col + 2) * lda + aOffset]; + double a13 = a[(row + 1) + (col + 3) * lda + aOffset]; + double a22 = a[(row + 2) + (col + 2) * lda + aOffset]; + double a23 = a[(row + 2) + (col + 3) * lda + aOffset]; + double a33 = a[(row + 3) + (col + 3) * lda + aOffset]; + y[col + yOffset] += a00 * alphaMulX0 + a01 * alphaMulX1 + a02 * alphaMulX2 + a03 * alphaMulX3 + accum0; + y[(col + 1) + yOffset] += a01 * alphaMulX0 + a11 * 
alphaMulX1 + a12 * alphaMulX2 + a13 * alphaMulX3 + + accum1; + y[(col + 2) + yOffset] += a02 * alphaMulX0 + a12 * alphaMulX1 + a22 * alphaMulX2 + a23 * alphaMulX3 + + accum2; + y[(col + 3) + yOffset] += a03 * alphaMulX0 + a13 * alphaMulX1 + a23 * alphaMulX2 + a33 * alphaMulX3 + + accum3; + } + for (; col < n; col++) { + double alphaMulX0 = alpha * x[col + xOffset]; + DoubleVector alphaXv0 = DoubleVector.broadcast(DSPECIES, alphaMulX0); + DoubleVector accumv0 = DoubleVector.zero(DSPECIES); + int row = 0; + for (; row < col - col % DSPECIES.length(); row += DSPECIES.length()) { + DoubleVector av = DoubleVector.fromArray(DSPECIES, a, row + col * lda + aOffset); + DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, row + yOffset); + DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, row + xOffset); + av.fma(alphaXv0, yv).intoArray(y, row + yOffset); + accumv0 = av.fma(xv, accumv0); + } + double accum0 = alpha * accumv0.reduceLanes(VectorOperators.ADD); + for (; row < col; row++) { + double a0 = a[row + col * lda + aOffset]; + y[row + yOffset] += a0 * alphaMulX0; + accum0 += alpha * a0 * x[row + xOffset]; + } + y[col + yOffset] += a[row + col * lda + aOffset] * alphaMulX0 + accum0; + } + } + + private static void norDsymvU(int n, double[] x, int xOffset, int incx, double alpha, double[] y, int yOffset, + int incy, double[] a, int aOffset, int lda, int xStartIndex, int yStartIndex) { + for (int col = 0, xj = xStartIndex, yj = yStartIndex; col < n; col++, xj += incx, yj += incy) { + double alphaMulX = alpha * x[xj + xOffset]; + double accum = 0.0d; + + for (int row = 0, xIndx = xStartIndex, yIndx = yStartIndex; row < col; row++, xIndx += incx, + yIndx += incy) { + y[yIndx + yOffset] += alphaMulX * a[row + col * lda + aOffset]; + accum += a[row + col * lda + aOffset] * x[xIndx + xOffset]; + } + y[yj + yOffset] += alphaMulX * a[col + col * lda + aOffset] + alpha * accum; + } + } + + private static void vecDsymvL(int n, double[] x, int xOffset, double alpha, double[] y, 
int yOffset, double[] a, + int aOffset, int lda) { + int col = 0; + int colLoopBound = loopBound(n, 4); + for (; col < colLoopBound; col += 4) { // 4 is unroll size for column + int row = col; + double a00 = a[aOffset + row + col * lda]; + double a10 = a[aOffset + (row + 1) + col * lda]; + double a20 = a[aOffset + (row + 2) + col * lda]; + double a30 = a[aOffset + (row + 3) + col * lda]; + double a11 = a[aOffset + (row + 1) + (col + 1) * lda]; + double a21 = a[aOffset + (row + 2) + (col + 1) * lda]; + double a31 = a[aOffset + (row + 3) + (col + 1) * lda]; + double a22 = a[aOffset + (row + 2) + (col + 2) * lda]; + double a32 = a[aOffset + (row + 3) + (col + 2) * lda]; + double a33 = a[aOffset + (row + 3) + (col + 3) * lda]; + double alphaMulX0 = alpha * x[xOffset + col]; + double alphaMulX1 = alpha * x[xOffset + (col + 1)]; + double alphaMulX2 = alpha * x[xOffset + (col + 2)]; + double alphaMulX3 = alpha * x[xOffset + (col + 3)]; + double accum0 = alphaMulX0 * a00 + alphaMulX1 * a10 + alphaMulX2 * a20 + alphaMulX3 * a30; + double accum1 = alphaMulX0 * a10 + alphaMulX1 * a11 + alphaMulX2 * a21 + alphaMulX3 * a31; + double accum2 = alphaMulX0 * a20 + alphaMulX1 * a21 + alphaMulX2 * a22 + alphaMulX3 * a32; + double accum3 = alphaMulX0 * a30 + alphaMulX1 * a31 + alphaMulX2 * a32 + alphaMulX3 * a33; + DoubleVector alphaMulXV0 = DoubleVector.broadcast(DSPECIES, alphaMulX0); + DoubleVector alphaMulXV1 = DoubleVector.broadcast(DSPECIES, alphaMulX1); + DoubleVector alphaMulXV2 = DoubleVector.broadcast(DSPECIES, alphaMulX2); + DoubleVector alphaMulXV3 = DoubleVector.broadcast(DSPECIES, alphaMulX3); + DoubleVector accumv0 = DoubleVector.zero(DSPECIES); + DoubleVector accumv1 = DoubleVector.zero(DSPECIES); + DoubleVector accumv2 = DoubleVector.zero(DSPECIES); + DoubleVector accumv3 = DoubleVector.zero(DSPECIES); + row += 4; + for (; row <= (n - n % DSPECIES.length() - DSPECIES.length()); row += DSPECIES.length()) { + DoubleVector av0 = DoubleVector.fromArray(DSPECIES, a, 
aOffset + row + col * lda); + DoubleVector av1 = DoubleVector.fromArray(DSPECIES, a, aOffset + row + (col + 1) * lda); + DoubleVector av2 = DoubleVector.fromArray(DSPECIES, a, aOffset + row + (col + 2) * lda); + DoubleVector av3 = DoubleVector.fromArray(DSPECIES, a, aOffset + row + (col + 3) * lda); + DoubleVector yv = DoubleVector.fromArray(DSPECIES, y, yOffset + row); + DoubleVector xv = DoubleVector.fromArray(DSPECIES, x, xOffset + row); + yv = alphaMulXV0.fma(av0, yv); + yv = alphaMulXV1.fma(av1, yv); + yv = alphaMulXV2.fma(av2, yv); + alphaMulXV3.fma(av3, yv).intoArray(y, yOffset + row); + accumv0 = xv.fma(av0, accumv0); + accumv1 = xv.fma(av1, accumv1); + accumv2 = xv.fma(av2, accumv2); + accumv3 = xv.fma(av3, accumv3); + } + accum0 += alpha * accumv0.reduceLanes(VectorOperators.ADD); + accum1 += alpha * accumv1.reduceLanes(VectorOperators.ADD); + accum2 += alpha * accumv2.reduceLanes(VectorOperators.ADD); + accum3 += alpha * accumv3.reduceLanes(VectorOperators.ADD); + for (; row < n; row += 1) { + double a0 = a[aOffset + row + col * lda]; + double a1 = a[aOffset + row + (col + 1) * lda]; + double a2 = a[aOffset + row + (col + 2) * lda]; + double a3 = a[aOffset + row + (col + 3) * lda]; + y[yOffset + row] += alphaMulX0 * a0 + alphaMulX1 * a1 + alphaMulX2 * a2 + alphaMulX3 * a3; + accum0 += alpha * x[xOffset + row] * a0; + accum1 += alpha * x[xOffset + row] * a1; + accum2 += alpha * x[xOffset + row] * a2; + accum3 += alpha * x[xOffset + row] * a3; + } + y[yOffset + col] += accum0; + y[yOffset + (col + 1)] += accum1; + y[yOffset + (col + 2)] += accum2; + y[yOffset + (col + 3)] += accum3; + } + for (; col < n; col += 1) { + double alphaMulX0 = alpha * x[xOffset + col]; + y[yOffset + col] += a[aOffset + col + col * lda] * alphaMulX0; + int row = col + 1; + double accum0 = 0.0d; + for (; row < n; row++) { + double a0 = a[aOffset + row + col * lda]; + y[yOffset + row] += a0 * alphaMulX0; + accum0 += x[xOffset + row] * a0; + } + y[yOffset + col] += alpha * accum0; + 
} + } + + private static void norDsymvL(int n, double[] x, int xOffset, int incx, double alpha, double[] y, int yOffset, + int incy, double[] a, int aOffset, int lda, int xStartIndex, int yStartIndex) { + for (int col = 0, xj = xStartIndex, yj = yStartIndex; col < n; col++, xj += incx, yj += incy) { + double alphaMulX = alpha * x[xj + xOffset]; + y[yj + yOffset] += alphaMulX * a[col + col * lda + aOffset]; + double accum = 0.0d; + + for (int row = col + 1, xIndx = xj + incx, yIndx = yj + incy; row < n; row++, xIndx += incx, + yIndx += incy) { + y[yIndx + yOffset] += alphaMulX * a[row + col * lda + aOffset]; + accum += a[row + col * lda + aOffset] * x[xIndx + xOffset]; + } + y[yj + yOffset] += alpha * accum; + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/SblasLevel2.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/SblasLevel2.java new file mode 100644 index 0000000000000000000000000000000000000000..6f1cf5f1e4495fb1a262c4cf6446cafd04177e06 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/SblasLevel2.java @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas2.singleprecision; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorSpecies; + +public class SblasLevel2 { + private static final VectorSpecies SSPECIES = FloatVector.SPECIES_MAX; + + protected static void sMulBeta(int size, float beta, float[] sy, int yOffset, int incy) { + if (incy == 1) { + FloatVector betaVec = FloatVector.broadcast(SSPECIES, beta); + int idx = 0; + for (; idx < SSPECIES.loopBound(size); idx += SSPECIES.length()) { + FloatVector yv = FloatVector.fromArray(SSPECIES, sy, idx + yOffset); + betaVec.mul(yv).intoArray(sy, idx + yOffset); + } + for (; idx < size; idx++) { + sy[idx + yOffset] = beta * sy[idx + yOffset]; + } + } else { + int yIndex = incy >= 0 ? 0 : (1 - size) * incy; + if (BlasUtils.isZero(beta)) { + for (int i = 0; i < size; i++, yIndex += incy) { + sy[yIndex + yOffset] = 0.0f; + } + } else { + for (int i = 0; i < size; i++, yIndex += incy) { + sy[yIndex + yOffset] = beta * sy[yIndex + yOffset]; + } + } + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sgemv.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sgemv.java new file mode 100644 index 0000000000000000000000000000000000000000..a3b8cdc3965025ac118dfce27aa54bc9e11ae869 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sgemv.java @@ -0,0 +1,377 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. 
+ * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas2.singleprecision; + +import static com.huawei.vectorblas.utils.ArrayUtil.loopBound; + +import com.huawei.vectorblas.utils.BlasUtils; +import com.huawei.vectorblas.utils.Lsame; + +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; + +public class Sgemv { + private static final VectorSpecies SSPECIES = FloatVector.SPECIES_MAX; + + public static void sgemv(String trans, int m, int n, float alpha, float[] a, int aOffset, int lda, float[] x, + int xOffset, int incx, float beta, float[] y, int yOffset, int incy) { + BlasUtils.checkParameter("SGEMV", 1, Lsame.lsame(trans, "N") || Lsame.lsame(trans, "T")); + BlasUtils.checkParameter("SGEMV", 2, m >= 0); + BlasUtils.checkParameter("SGEMV", 3, n >= 0); + BlasUtils.checkParameter("SGEMV", 6, lda >= Math.max(1, m)); + BlasUtils.checkParameter("SGEMV", 8, incx != 0); + BlasUtils.checkParameter("SGEMV", 11, incy != 0); + if (m 
== 0 || n == 0 || (BlasUtils.isZero(alpha) && Float.compare(beta, 1.0f) == 0)) { + return; + } + boolean transFlag = Lsame.lsame(trans, "N"); + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * ((transFlag ? n : m) - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * ((transFlag ? m : n) - 1), y.length); + BlasUtils.checkBlasArray("a", aOffset, (n - 1) * lda + m - 1, a.length); + + if (Float.compare(beta, 1.0f) != 0) { + SblasLevel2.sMulBeta(transFlag ? m : n, beta, y, yOffset, incy); + } + if (BlasUtils.isZero(alpha)) { + return; + } + if (transFlag) { + if (incy == 1) { + if (incx == 1) { + vecSgemvN(m, n, alpha, a, aOffset, lda, x, xOffset, y, yOffset); + } else { + vecSgemvN(m, n, alpha, a, aOffset, lda, x, xOffset, incx, y, yOffset); + } + } else { + norSgemvN(m, n, alpha, a, aOffset, lda, x, xOffset, incx, y, yOffset, incy); + } + } else { + if (incx == 1) { + if (incy == 1) { + vecSgemvT(m, n, alpha, a, aOffset, lda, x, xOffset, y, yOffset); + } else { + vecSgemvT(m, n, alpha, a, aOffset, lda, x, xOffset, y, yOffset, incy); + } + } else { + norSgemvT(m, n, alpha, a, aOffset, lda, x, xOffset, incx, y, yOffset, incy); + } + } + } + + private static void vecSgemvN(int m, int n, float alpha, float[] a, int aOffset, int lda, + float[] x, int xOffset, float[] y, int yOffset) { + int col = 0; + int colLoopBound = loopBound(n, 4); + int rowUnrollLoopBound = loopBound(m, SSPECIES.length() * 4); + int rowLoopBound = loopBound(m, SSPECIES.length()); + for (; col < colLoopBound; col += 4) { + FloatVector xv0 = FloatVector.broadcast(SSPECIES, alpha * x[col + xOffset]); + FloatVector xv1 = FloatVector.broadcast(SSPECIES, alpha * x[col + 1 + xOffset]); + FloatVector xv2 = FloatVector.broadcast(SSPECIES, alpha * x[col + 2 + xOffset]); + FloatVector xv3 = FloatVector.broadcast(SSPECIES, alpha * x[col + 3 + xOffset]); + int row = 0; + for (; row < rowUnrollLoopBound; row += SSPECIES.length() * 4) { + FloatVector yv0 = 
FloatVector.fromArray(SSPECIES, y, row + yOffset); + FloatVector yv1 = FloatVector.fromArray(SSPECIES, y, row + SSPECIES.length() + yOffset); + FloatVector yv2 = FloatVector.fromArray(SSPECIES, y, row + SSPECIES.length() * 2 + yOffset); + FloatVector yv3 = FloatVector.fromArray(SSPECIES, y, row + SSPECIES.length() * 3 + yOffset); + + FloatVector av00 = FloatVector.fromArray(SSPECIES, a, row + col * lda + aOffset); + FloatVector av10 = FloatVector.fromArray( + SSPECIES, a, row + SSPECIES.length() + col * lda + aOffset); + FloatVector av20 = FloatVector.fromArray( + SSPECIES, a, (row + SSPECIES.length() * 2) + col * lda + aOffset); + FloatVector av30 = FloatVector.fromArray( + SSPECIES, a, (row + SSPECIES.length() * 3) + col * lda + aOffset); + + FloatVector av01 = FloatVector.fromArray(SSPECIES, a, row + (col + 1) * lda + aOffset); + FloatVector av11 = FloatVector.fromArray( + SSPECIES, a, row + SSPECIES.length() + (col + 1) * lda + aOffset); + FloatVector av21 = FloatVector.fromArray( + SSPECIES, a, (row + SSPECIES.length() * 2) + (col + 1) * lda + aOffset); + FloatVector av31 = FloatVector.fromArray( + SSPECIES, a, (row + SSPECIES.length() * 3) + (col + 1) * lda + aOffset); + + FloatVector av02 = FloatVector.fromArray(SSPECIES, a, row + (col + 2) * lda + aOffset); + FloatVector av12 = FloatVector.fromArray( + SSPECIES, a, row + SSPECIES.length() + (col + 2) * lda + aOffset); + FloatVector av22 = FloatVector.fromArray( + SSPECIES, a, (row + SSPECIES.length() * 2) + (col + 2) * lda + aOffset); + FloatVector av32 = FloatVector.fromArray( + SSPECIES, a, (row + SSPECIES.length() * 3) + (col + 2) * lda + aOffset); + + FloatVector av03 = FloatVector.fromArray(SSPECIES, a, row + (col + 3) * lda + aOffset); + FloatVector av13 = FloatVector.fromArray( + SSPECIES, a, row + SSPECIES.length() + (col + 3) * lda + aOffset); + FloatVector av23 = FloatVector.fromArray( + SSPECIES, a, (row + SSPECIES.length() * 2) + (col + 3) * lda + aOffset); + FloatVector av33 = 
FloatVector.fromArray( + SSPECIES, a, (row + SSPECIES.length() * 3) + (col + 3) * lda + aOffset); + + av00.fma(xv0, av01.fma(xv1, av02.fma(xv2, av03.fma(xv3, yv0)))).intoArray(y, row + yOffset); + av10.fma(xv0, av11.fma(xv1, av12.fma(xv2, av13.fma(xv3, yv1)))) + .intoArray(y, row + SSPECIES.length() + yOffset); + av20.fma(xv0, av21.fma(xv1, av22.fma(xv2, av23.fma(xv3, yv2)))) + .intoArray(y, row + SSPECIES.length() * 2 + yOffset); + av30.fma(xv0, av31.fma(xv1, av32.fma(xv2, av33.fma(xv3, yv3)))) + .intoArray(y, row + SSPECIES.length() * 3 + yOffset); + } + for (; row < rowLoopBound; row += SSPECIES.length()) { + FloatVector yv = FloatVector.fromArray(SSPECIES, y, row + yOffset); + + FloatVector av0 = FloatVector.fromArray(SSPECIES, a, row + col * lda + aOffset); + FloatVector av1 = FloatVector.fromArray(SSPECIES, a, row + (col + 1) * lda + aOffset); + FloatVector av2 = FloatVector.fromArray(SSPECIES, a, row + (col + 2) * lda + aOffset); + FloatVector av3 = FloatVector.fromArray(SSPECIES, a, row + (col + 3) * lda + aOffset); + + av0.fma(xv0, av1.fma(xv1, av2.fma(xv2, av3.fma(xv3, yv)))).intoArray(y, row + yOffset); + } + float x0 = alpha * x[col + xOffset]; + float x1 = alpha * x[col + 1 + xOffset]; + float x2 = alpha * x[col + 2 + xOffset]; + float x3 = alpha * x[col + 3 + xOffset]; + for (; row < m; row++) { + y[row + yOffset] += x0 * a[row + col * lda + aOffset] + + x1 * a[row + (col + 1) * lda + aOffset] + + x2 * a[row + (col + 2) * lda + aOffset] + + x3 * a[row + (col + 3) * lda + aOffset]; + } + } + for (; col < n; col++) { + if (!BlasUtils.isZero(x[col + xOffset])) { + FloatVector bv = FloatVector.broadcast(SSPECIES, alpha * x[col + xOffset]); + int row = 0; + for (; row < rowUnrollLoopBound; row += SSPECIES.length() * 4) { + FloatVector yv0 = FloatVector.fromArray(SSPECIES, y, row + yOffset); + FloatVector yv1 = FloatVector.fromArray(SSPECIES, y, row + SSPECIES.length() + yOffset); + FloatVector yv2 = FloatVector.fromArray(SSPECIES, y, row + 
SSPECIES.length() * 2 + yOffset); + FloatVector yv3 = FloatVector.fromArray(SSPECIES, y, row + SSPECIES.length() * 3 + yOffset); + + FloatVector av0 = FloatVector.fromArray(SSPECIES, a, row + col * lda + aOffset); + FloatVector av1 = FloatVector.fromArray( + SSPECIES, a, row + SSPECIES.length() + col * lda + aOffset); + FloatVector av2 = FloatVector.fromArray( + SSPECIES, a, (row + SSPECIES.length() * 2) + col * lda + aOffset); + FloatVector av3 = FloatVector.fromArray( + SSPECIES, a, (row + SSPECIES.length() * 3) + col * lda + aOffset); + + av0.fma(bv, yv0).intoArray(y, row + yOffset); + av1.fma(bv, yv1).intoArray(y, row + SSPECIES.length() + yOffset); + av2.fma(bv, yv2).intoArray(y, row + SSPECIES.length() * 2 + yOffset); + av3.fma(bv, yv3).intoArray(y, row + SSPECIES.length() * 3 + yOffset); + } + for (; row < rowLoopBound; row += SSPECIES.length()) { + FloatVector yv = FloatVector.fromArray(SSPECIES, y, row + yOffset); + FloatVector av = FloatVector.fromArray(SSPECIES, a, row + col * lda + aOffset); + bv.fma(av, yv).intoArray(y, row + yOffset); + } + float alphaX = alpha * x[col + xOffset]; + for (; row < m; row++) { + y[row + yOffset] += alphaX * a[row + col * lda + aOffset]; + } + } + } + } + + private static void vecSgemvN(int m, int n, float alpha, float[] a, int aOffset, int lda, + float[] x, int xOffset, int incx, float[] y, int yOffset) { + int xIndex = incx > 0 ? 
0 : (n - 1) * (-incx); + int rowLoopBound = SSPECIES.loopBound(m); + for (int col = 0; col < n; col++, xIndex += incx) { + if (!BlasUtils.isZero(x[xIndex + xOffset])) { + float alphaMulX = alpha * x[xIndex + xOffset]; + FloatVector alphaMulXv = FloatVector.broadcast(SSPECIES, alphaMulX); + int row = 0; + for (; row < rowLoopBound; row += SSPECIES.length()) { + FloatVector av = FloatVector.fromArray(SSPECIES, a, row + col * lda + aOffset); + FloatVector cv = FloatVector.fromArray(SSPECIES, y, row + yOffset); + av.fma(alphaMulXv, cv).intoArray(y, row + yOffset); + } + for (; row < m; row++) { + y[row + yOffset] += alphaMulX * a[row + col * lda + aOffset]; + } + } + } + } + + private static void norSgemvN(int m, int n, float alpha, float[] a, int aOffset, int lda, + float[] x, int xOffset, int incx, float[] y, int yOffset, int incy) { + int xIndex = incx > 0 ? 0 : (n - 1) * (-incx); + for (int col = 0; col < n; col++, xIndex += incx) { + if (!BlasUtils.isZero(x[xIndex + xOffset])) { + float alphaMulX = alpha * x[xIndex + xOffset]; + int yIndex = incy > 0 ? 0 : (m - 1) * (-incy); + for (int row = 0; row < m; row++, yIndex += incy) { + y[yIndex + yOffset] += alphaMulX * a[row + col * lda + aOffset]; + } + } + } + } + + private static void vecSgemvT(int m, int n, float alpha, float[] a, int aOffset, int lda, float[] x, int xOffset, + float[] y, int yOffset, int incy) { + int yIndex = incy > 0 ? 
0 : (n - 1) * (-incy); + int colLoopBound = SSPECIES.loopBound(m); + for (int row = 0; row < n; row++, yIndex += incy) { + FloatVector cv = FloatVector.zero(SSPECIES); + int col = 0; + for (; col < colLoopBound; col += SSPECIES.length()) { + FloatVector av = FloatVector.fromArray(SSPECIES, a, col + row * lda + aOffset); + FloatVector bv = FloatVector.fromArray(SSPECIES, x, col + xOffset); + cv = av.fma(bv, cv); + } + float accum = cv.reduceLanes(VectorOperators.ADD); + for (; col < m; col++) { + accum += a[col + row * lda + aOffset] * x[col + xOffset]; + } + y[yIndex + yOffset] += alpha * accum; + } + } + + private static void vecSgemvT(int m, int n, float alpha, float[] a, int aOffset, int lda, + float[] x, int xOffset, float[] y, int yOffset) { + int row = 0; + int rowLoopBound = loopBound(n, 4); + int colUnrollLoopBound = loopBound(m, SSPECIES.length() * 4); + int colLoopBound = loopBound(m, SSPECIES.length()); + for (; row < rowLoopBound; row += 4) { + FloatVector yv0 = FloatVector.zero(SSPECIES); + FloatVector yv1 = FloatVector.zero(SSPECIES); + FloatVector yv2 = FloatVector.zero(SSPECIES); + FloatVector yv3 = FloatVector.zero(SSPECIES); + int col = 0; + for (; col < colUnrollLoopBound; col += SSPECIES.length() * 4) { + FloatVector xv0 = FloatVector.fromArray(SSPECIES, x, col + xOffset); + FloatVector xv1 = FloatVector.fromArray(SSPECIES, x, col + SSPECIES.length() + xOffset); + FloatVector xv2 = FloatVector.fromArray(SSPECIES, x, col + (SSPECIES.length() * 2) + xOffset); + FloatVector xv3 = FloatVector.fromArray(SSPECIES, x, col + (SSPECIES.length() * 3) + xOffset); + + FloatVector av00 = FloatVector.fromArray(SSPECIES, a, col + row * lda + aOffset); + FloatVector av10 = FloatVector.fromArray( + SSPECIES, a, col + SSPECIES.length() + row * lda + aOffset); + FloatVector av20 = FloatVector.fromArray( + SSPECIES, a, col + (SSPECIES.length() * 2) + row * lda + aOffset); + FloatVector av30 = FloatVector.fromArray( + SSPECIES, a, col + (SSPECIES.length() * 3) + row 
* lda + aOffset); + yv0 = av00.fma(xv0, av10.fma(xv1, av20.fma(xv2, av30.fma(xv3, yv0)))); + + FloatVector av01 = FloatVector.fromArray(SSPECIES, a, col + (row + 1) * lda + aOffset); + FloatVector av11 = FloatVector.fromArray( + SSPECIES, a, col + SSPECIES.length() + (row + 1) * lda + aOffset); + FloatVector av21 = FloatVector.fromArray( + SSPECIES, a, col + (SSPECIES.length() * 2) + (row + 1) * lda + aOffset); + FloatVector av31 = FloatVector.fromArray( + SSPECIES, a, col + (SSPECIES.length() * 3) + (row + 1) * lda + aOffset); + yv1 = av01.fma(xv0, av11.fma(xv1, av21.fma(xv2, av31.fma(xv3, yv1)))); + + FloatVector av02 = FloatVector.fromArray(SSPECIES, a, col + (row + 2) * lda + aOffset); + FloatVector av12 = FloatVector.fromArray( + SSPECIES, a, col + SSPECIES.length() + (row + 2) * lda + aOffset); + FloatVector av22 = FloatVector.fromArray( + SSPECIES, a, col + (SSPECIES.length() * 2) + (row + 2) * lda + aOffset); + FloatVector av32 = FloatVector.fromArray( + SSPECIES, a, col + (SSPECIES.length() * 3) + (row + 2) * lda + aOffset); + yv2 = av02.fma(xv0, av12.fma(xv1, av22.fma(xv2, av32.fma(xv3, yv2)))); + + FloatVector av03 = FloatVector.fromArray(SSPECIES, a, col + (row + 3) * lda + aOffset); + FloatVector av13 = FloatVector.fromArray( + SSPECIES, a, col + SSPECIES.length() + (row + 3) * lda + aOffset); + FloatVector av23 = FloatVector.fromArray( + SSPECIES, a, col + (SSPECIES.length() * 2) + (row + 3) * lda + aOffset); + FloatVector av33 = FloatVector.fromArray( + SSPECIES, a, col + (SSPECIES.length() * 3) + (row + 3) * lda + aOffset); + yv3 = av03.fma(xv0, av13.fma(xv1, av23.fma(xv2, av33.fma(xv3, yv3)))); + } + for (; col < colLoopBound; col += SSPECIES.length()) { + FloatVector xv = FloatVector.fromArray(SSPECIES, x, col + xOffset); + + FloatVector av0 = FloatVector.fromArray(SSPECIES, a, col + row * lda + aOffset); + FloatVector av1 = FloatVector.fromArray(SSPECIES, a, col + (row + 1) * lda + aOffset); + FloatVector av2 = FloatVector.fromArray(SSPECIES, a, 
col + (row + 2) * lda + aOffset); + FloatVector av3 = FloatVector.fromArray(SSPECIES, a, col + (row + 3) * lda + aOffset); + + yv0 = av0.fma(xv, yv0); + yv1 = av1.fma(xv, yv1); + yv2 = av2.fma(xv, yv2); + yv3 = av3.fma(xv, yv3); + } + float accum0 = yv0.reduceLanes(VectorOperators.ADD); + float accum1 = yv1.reduceLanes(VectorOperators.ADD); + float accum2 = yv2.reduceLanes(VectorOperators.ADD); + float accum3 = yv3.reduceLanes(VectorOperators.ADD); + for (; col < m; col++) { + accum0 += a[col + row * lda + aOffset] * x[col + xOffset]; + accum1 += a[col + (row + 1) * lda + aOffset] * x[col + xOffset]; + accum2 += a[col + (row + 2) * lda + aOffset] * x[col + xOffset]; + accum3 += a[col + (row + 3) * lda + aOffset] * x[col + xOffset]; + } + y[row + yOffset] += alpha * accum0; + y[row + 1 + yOffset] += alpha * accum1; + y[row + 2 + yOffset] += alpha * accum2; + y[row + 3 + yOffset] += alpha * accum3; + } + for (; row < n; row++) { + FloatVector yv = FloatVector.zero(SSPECIES); + int col = 0; + for (; col < colUnrollLoopBound; col += SSPECIES.length() * 4) { + FloatVector xv0 = FloatVector.fromArray(SSPECIES, x, col + xOffset); + FloatVector xv1 = FloatVector.fromArray(SSPECIES, x, col + SSPECIES.length() + xOffset); + FloatVector xv2 = FloatVector.fromArray(SSPECIES, x, col + (SSPECIES.length() * 2) + xOffset); + FloatVector xv3 = FloatVector.fromArray(SSPECIES, x, col + (SSPECIES.length() * 3) + xOffset); + + FloatVector av0 = FloatVector.fromArray(SSPECIES, a, col + row * lda + aOffset); + FloatVector av1 = FloatVector.fromArray(SSPECIES, a, col + SSPECIES.length() + row * lda + aOffset); + FloatVector av2 = FloatVector.fromArray( + SSPECIES, a, col + (SSPECIES.length() * 2) + row * lda + aOffset); + FloatVector av3 = FloatVector.fromArray( + SSPECIES, a, col + (SSPECIES.length() * 3) + row * lda + aOffset); + + yv = av0.fma(xv0, av1.fma(xv1, av2.fma(xv2, av3.fma(xv3, yv)))); + } + for (; col < colLoopBound; col += SSPECIES.length()) { + FloatVector xv = 
FloatVector.fromArray(SSPECIES, x, col + xOffset); + FloatVector av = FloatVector.fromArray(SSPECIES, a, col + row * lda + aOffset); + yv = xv.fma(av, yv); + } + float accum = yv.reduceLanes(VectorOperators.ADD); + for (; col < m; col++) { + accum += x[col + xOffset] * a[col + row * lda + aOffset]; + } + y[row + yOffset] += alpha * accum; + } + } + + private static void norSgemvT(int m, int n, float alpha, float[] a, int aOffset, int lda, float[] x, int xOffset, + int incx, float[] y, int yOffset, int incy) { + int yIndex = incy > 0 ? 0 : (n - 1) * (-incy); + for (int row = 0; row < n; row++, yIndex += incy) { + float accum = 0.0f; + int xIndex = incx > 0 ? 0 : (m - 1) * (-incx); + for (int col = 0; col < m; col++, xIndex += incx) { + accum += a[col + row * lda + aOffset] * x[xIndex + xOffset]; + } + y[yIndex + yOffset] += alpha * accum; + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sger.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sger.java new file mode 100644 index 0000000000000000000000000000000000000000..1af0afc40d7747f8407a53a7f7ceb04781b2f359 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sger.java @@ -0,0 +1,177 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas2.singleprecision; + +import static com.huawei.vectorblas.utils.ArrayUtil.loopBound; + +import com.huawei.vectorblas.utils.BlasUtils; + +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorSpecies; + +/** Single-precision rank-one update: adds alpha * x[i] * y[j] to a[i + j * lda + aOffset] for i in [0, m) and j in [0, n). */ public class Sger { + private static final VectorSpecies SSPECIES = FloatVector.SPECIES_MAX; + private static final int UNROLL_SIZE = 4; + + /** Entry point. Runs the BlasUtils.checkParameter checks (m and n non-negative, incx and incy non-zero, lda at least max(1, m)), returns without touching a when m == 0, n == 0 or BlasUtils.isZero(alpha), then passes the x, y and a ranges to BlasUtils.checkBlasArray. When both strides are 1 the work goes to vecSger, otherwise to normalSger. NOTE(review): how a failed check is reported depends on BlasUtils, which is not visible here. */ public static void sger(int m, int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset, + int incy, float[] a, int aOffset, int lda) { + BlasUtils.checkParameter("SGER", 1, m >= 0); + BlasUtils.checkParameter("SGER", 2, n >= 0); + BlasUtils.checkParameter("SGER", 5, incx != 0); + BlasUtils.checkParameter("SGER", 7, incy != 0); + BlasUtils.checkParameter("SGER", 9, lda >= Math.max(1, m)); + + if (m == 0 || n == 0 || BlasUtils.isZero(alpha)) { + return; + } + + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (m - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length); + BlasUtils.checkBlasArray("a", aOffset, (m - 1) + (n - 1) * lda, a.length); + + if (incx == 1 && incy == 1) { + vecSger(m, n, alpha, x, xOffset, y, yOffset, a, aOffset, lda); + } else { + normalSger(m, n, alpha, x, xOffset, incx, y, yOffset, incy, a, aOffset, lda); + } + } + + /** Unit-stride kernel. Columns are handled UNROLL_SIZE (four) at a time and rows in chunks of UNROLL_SIZE * SSPECIES.length(); a scalar loop finishes the leftover rows and a second column loop handles the leftover columns one at a time. Each column uses alpha * y[col + yOffset] broadcast across a vector. Presumably loopBound rounds its first argument down to a multiple of the second; confirm in ArrayUtil. */ private static void vecSger(int m, int n, float alpha, float[] x, int xOffset, float[] y, int yOffset, float[] a, + int aOffset, int lda) { + int colLoopBound = loopBound(n, UNROLL_SIZE); + int rowLoopBound = loopBound(m, UNROLL_SIZE * 
SSPECIES.length()); + int col = 0; + for (; col < colLoopBound; col += UNROLL_SIZE) { + FloatVector alphaMulYv0 = FloatVector.broadcast(SSPECIES, alpha * y[col + yOffset]); + FloatVector alphaMulYv1 = FloatVector.broadcast(SSPECIES, alpha * y[col + 1 + yOffset]); + FloatVector alphaMulYv2 = FloatVector.broadcast(SSPECIES, alpha * y[col + 2 + yOffset]); + FloatVector alphaMulYv3 = FloatVector.broadcast(SSPECIES, alpha * y[col + 3 + yOffset]); + int row = 0; + for (; row < rowLoopBound; row += UNROLL_SIZE * SSPECIES.length()) { + FloatVector xv0 = FloatVector.fromArray(SSPECIES, x, row + xOffset); + FloatVector xv1 = FloatVector.fromArray(SSPECIES, x, row + SSPECIES.length() + xOffset); + FloatVector xv2 = FloatVector.fromArray(SSPECIES, x, row + 2 * SSPECIES.length() + xOffset); + FloatVector xv3 = FloatVector.fromArray(SSPECIES, x, row + 3 * SSPECIES.length() + xOffset); + + FloatVector av00 = FloatVector.fromArray(SSPECIES, a, row + col * lda + aOffset); + FloatVector av01 = FloatVector.fromArray(SSPECIES, a, row + SSPECIES.length() + col * lda + aOffset); + FloatVector av02 = FloatVector.fromArray(SSPECIES, a, + row + 2 * SSPECIES.length() + col * lda + aOffset); + FloatVector av03 = FloatVector.fromArray(SSPECIES, a, + row + 3 * SSPECIES.length() + col * lda + aOffset); + + xv0.fma(alphaMulYv0, av00).intoArray(a, row + col * lda + aOffset); + xv1.fma(alphaMulYv0, av01).intoArray(a, row + SSPECIES.length() + col * lda + aOffset); + xv2.fma(alphaMulYv0, av02).intoArray(a, row + 2 * SSPECIES.length() + col * lda + aOffset); + xv3.fma(alphaMulYv0, av03).intoArray(a, row + 3 * SSPECIES.length() + col * lda + aOffset); + + FloatVector av10 = FloatVector.fromArray(SSPECIES, a, row + (col + 1) * lda + aOffset); + FloatVector av11 = FloatVector.fromArray(SSPECIES, a, + row + SSPECIES.length() + (col + 1) * lda + aOffset); + FloatVector av12 = FloatVector.fromArray(SSPECIES, a, + row + 2 * SSPECIES.length() + (col + 1) * lda + aOffset); + FloatVector av13 = 
FloatVector.fromArray(SSPECIES, a, + row + 3 * SSPECIES.length() + (col + 1) * lda + aOffset); + + xv0.fma(alphaMulYv1, av10).intoArray(a, row + (col + 1) * lda + aOffset); + xv1.fma(alphaMulYv1, av11).intoArray(a, row + SSPECIES.length() + (col + 1) * lda + aOffset); + xv2.fma(alphaMulYv1, av12).intoArray(a, row + 2 * SSPECIES.length() + (col + 1) * lda + aOffset); + xv3.fma(alphaMulYv1, av13).intoArray(a, row + 3 * SSPECIES.length() + (col + 1) * lda + aOffset); + + FloatVector av20 = FloatVector.fromArray(SSPECIES, a, row + (col + 2) * lda + aOffset); + FloatVector av21 = FloatVector.fromArray(SSPECIES, a, + row + SSPECIES.length() + (col + 2) * lda + aOffset); + FloatVector av22 = FloatVector.fromArray(SSPECIES, a, + row + 2 * SSPECIES.length() + (col + 2) * lda + aOffset); + FloatVector av23 = FloatVector.fromArray(SSPECIES, a, + row + 3 * SSPECIES.length() + (col + 2) * lda + aOffset); + + xv0.fma(alphaMulYv2, av20).intoArray(a, row + (col + 2) * lda + aOffset); + xv1.fma(alphaMulYv2, av21).intoArray(a, row + SSPECIES.length() + (col + 2) * lda + aOffset); + xv2.fma(alphaMulYv2, av22).intoArray(a, row + 2 * SSPECIES.length() + (col + 2) * lda + aOffset); + xv3.fma(alphaMulYv2, av23).intoArray(a, row + 3 * SSPECIES.length() + (col + 2) * lda + aOffset); + + FloatVector av30 = FloatVector.fromArray(SSPECIES, a, row + (col + 3) * lda + aOffset); + FloatVector av31 = FloatVector.fromArray(SSPECIES, a, + row + SSPECIES.length() + (col + 3) * lda + aOffset); + FloatVector av32 = FloatVector.fromArray(SSPECIES, a, + row + 2 * SSPECIES.length() + (col + 3) * lda + aOffset); + FloatVector av33 = FloatVector.fromArray(SSPECIES, a, + row + 3 * SSPECIES.length() + (col + 3) * lda + aOffset); + + xv0.fma(alphaMulYv3, av30).intoArray(a, row + (col + 3) * lda + aOffset); + xv1.fma(alphaMulYv3, av31).intoArray(a, row + SSPECIES.length() + (col + 3) * lda + aOffset); + xv2.fma(alphaMulYv3, av32).intoArray(a, row + 2 * SSPECIES.length() + (col + 3) * lda + aOffset); + 
xv3.fma(alphaMulYv3, av33).intoArray(a, row + 3 * SSPECIES.length() + (col + 3) * lda + aOffset); + } + float alphaMulY0 = alpha * y[col + yOffset]; + float alphaMulY1 = alpha * y[col + 1 + yOffset]; + float alphaMulY2 = alpha * y[col + 2 + yOffset]; + float alphaMulY3 = alpha * y[col + 3 + yOffset]; + for (; row < m; row++) { + a[row + col * lda + aOffset] += alphaMulY0 * x[row + xOffset]; + a[row + (col + 1) * lda + aOffset] += alphaMulY1 * x[row + xOffset]; + a[row + (col + 2) * lda + aOffset] += alphaMulY2 * x[row + xOffset]; + a[row + (col + 3) * lda + aOffset] += alphaMulY3 * x[row + xOffset]; + } + } + for (; col < n; col++) { + int row; + FloatVector alphaMulYv = FloatVector.broadcast(SSPECIES, alpha * y[col + yOffset]); + for (row = 0; row < rowLoopBound; row += UNROLL_SIZE * SSPECIES.length()) { + FloatVector av0 = FloatVector.fromArray(SSPECIES, a, row + col * lda + aOffset); + FloatVector av1 = FloatVector.fromArray(SSPECIES, a, row + SSPECIES.length() + col * lda + aOffset); + FloatVector av2 = FloatVector.fromArray(SSPECIES, a, row + 2 * SSPECIES.length() + col * lda + aOffset); + FloatVector av3 = FloatVector.fromArray(SSPECIES, a, row + 3 * SSPECIES.length() + col * lda + aOffset); + + FloatVector xv0 = FloatVector.fromArray(SSPECIES, x, row + xOffset); + FloatVector xv1 = FloatVector.fromArray(SSPECIES, x, row + SSPECIES.length() + xOffset); + FloatVector xv2 = FloatVector.fromArray(SSPECIES, x, row + 2 * SSPECIES.length() + xOffset); + FloatVector xv3 = FloatVector.fromArray(SSPECIES, x, row + 3 * SSPECIES.length() + xOffset); + + xv0.fma(alphaMulYv, av0).intoArray(a, row + col * lda + aOffset); + xv1.fma(alphaMulYv, av1).intoArray(a, row + SSPECIES.length() + col * lda + aOffset); + xv2.fma(alphaMulYv, av2).intoArray(a, row + 2 * SSPECIES.length() + col * lda + aOffset); + xv3.fma(alphaMulYv, av3).intoArray(a, row + 3 * SSPECIES.length() + col * lda + aOffset); + } + float alphaMulY0 = alpha * y[col + yOffset]; + for (; row < m; row++) { + a[row 
+ col * lda + aOffset] += alphaMulY0 * x[row + xOffset]; + } + } + } + + /** Strided fallback. For each column j whose y element is not BlasUtils.isZero, adds alpha * x[i] * y[j] to a[i + j * lda + aOffset] for every i in [0, m). A negative increment starts from -(m - 1) * incx or -(n - 1) * incy, so the index begins at the far end and steps toward 0 by the signed increment. */ private static void normalSger(int m, int n, float alpha, float[] x, int xOffset, int incx, float[] y, int yOffset, + int incy, float[] a, int aOffset, int lda) { + int xStartIndx = incx > 0 ? 0 : -(m - 1) * incx; + int yStartIndx = incy > 0 ? 0 : -(n - 1) * incy; + + /* yStartIndx doubles as the running y index for column j */ for (int j = 0; j < n; j++, yStartIndx += incy) { + if (!BlasUtils.isZero(y[yStartIndx + yOffset])) { + for (int i = 0, xIndx = xStartIndx; i < m; i++, xIndx += incx) { + a[i + j * lda + aOffset] += alpha * x[xIndx + xOffset] * y[yStartIndx + yOffset]; + } + } + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sspmv.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sspmv.java new file mode 100644 index 0000000000000000000000000000000000000000..1394b92d3d2b8331e6773bbc4484e493bc9ad404 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sspmv.java @@ -0,0 +1,288 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas2.singleprecision; + +import static com.huawei.vectorblas.utils.ArrayUtil.loopBound; + +import com.huawei.vectorblas.utils.BlasUtils; +import com.huawei.vectorblas.utils.Lsame; + +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; + +/** Single-precision matrix-vector update for a symmetric matrix held in packed form: one triangle stored column by column in a one-dimensional array. The kernels in this class add alpha * A * x to y; the beta handling is delegated to SblasLevel2.sMulBeta. */ public class Sspmv { + private static final VectorSpecies SSPECIES = FloatVector.SPECIES_MAX; + + /** Entry point. Checks through BlasUtils.checkParameter that uplo matches U or L under Lsame.lsame, that n is non-negative and that incx and incy are non-zero. Returns immediately when n == 0, or when alpha is zero and beta compares equal to 1.0f. Otherwise the x, y and packed a ranges (last packed index (1 + n) * n / 2 - 1) go to BlasUtils.checkBlasArray, y is handed to SblasLevel2.sMulBeta when beta is not 1.0f, and, unless alpha is zero, one of four kernels is chosen by uplo and by whether both strides are 1. NOTE(review): sMulBeta presumably scales y by beta; confirm in SblasLevel2. */ public static void sspmv(String uplo, int n, float alpha, float[] a, int aOffset, float[] x, int xOffset, int incx, + float beta, float[] y, int yOffset, int incy) { + BlasUtils.checkParameter("SSPMV", 1, Lsame.lsame(uplo, "U") || Lsame.lsame(uplo, "L")); + BlasUtils.checkParameter("SSPMV", 2, n >= 0); + BlasUtils.checkParameter("SSPMV", 6, incx != 0); + BlasUtils.checkParameter("SSPMV", 9, incy != 0); + + if (n == 0 || (BlasUtils.isZero(alpha) && Float.compare(beta, 1.0f) == 0)) { + return; + } + + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length); + BlasUtils.checkBlasArray("a", aOffset, (1 + n) * n / 2 - 1, a.length); + + boolean uploFlag = Lsame.lsame(uplo, "U"); + int xStartIndex = incx > 0 ? 0 : (n - 1) * (-incx); + int yStartIndex = incy > 0 ? 
0 : (n - 1) * (-incy); + if (Float.compare(beta, 1.0f) != 0) { + SblasLevel2.sMulBeta(n, beta, y, yOffset, incy); + } + if (BlasUtils.isZero(alpha)) { + return; + } + if (uploFlag) { + if (incx == 1 && incy == 1) { + vecSspmvU(n, alpha, a, aOffset, x, xOffset, y, yOffset); + } else { + norSspmvU(n, alpha, a, aOffset, x, xOffset, incx, y, yOffset, incy, xStartIndex, yStartIndex); + } + } else { + if (incx == 1 && incy == 1) { + vecSspmvL(n, alpha, a, aOffset, x, xOffset, y, yOffset); + } else { + norSspmvL(n, alpha, a, aOffset, x, xOffset, incx, y, yOffset, incy, xStartIndex, yStartIndex); + } + } + } + + /** Unit-stride kernel for the upper packed layout, in which element (row, col) with row not greater than col is read from a[aOffset + row + col * (col + 1) / 2]. Columns are taken four at a time. Each element above the 4x4 diagonal block is used twice: once to update y[row] and once in the running sum that is later added to the y entry of its own column. The diagonal block itself is handled with scalar code, and leftover columns are processed one at a time by the second loop. */ private static void vecSspmvU(int n, float alpha, float[] a, int aOffset, float[] x, int xOffset, float[] y, + int yOffset) { + int col = 0; + int colLoopBound = loopBound(n, 4); + for (; col < colLoopBound; col += 4) { // 4 is unroll size for column + float alphaMulX0 = alpha * x[xOffset + col]; + float alphaMulX1 = alpha * x[xOffset + (col + 1)]; + float alphaMulX2 = alpha * x[xOffset + (col + 2)]; + float alphaMulX3 = alpha * x[xOffset + (col + 3)]; + FloatVector alphaMulXV0 = FloatVector.broadcast(SSPECIES, alphaMulX0); + FloatVector alphaMulXV1 = FloatVector.broadcast(SSPECIES, alphaMulX1); + FloatVector alphaMulXV2 = FloatVector.broadcast(SSPECIES, alphaMulX2); + FloatVector alphaMulXV3 = FloatVector.broadcast(SSPECIES, alphaMulX3); + FloatVector accumv0 = FloatVector.zero(SSPECIES); + FloatVector accumv1 = FloatVector.zero(SSPECIES); + FloatVector accumv2 = FloatVector.zero(SSPECIES); + FloatVector accumv3 = FloatVector.zero(SSPECIES); + int row = 0; + for (; row < col - col % SSPECIES.length(); row += SSPECIES.length()) { + FloatVector av0 = FloatVector.fromArray(SSPECIES, a, aOffset + row + col * (col + 1) / 2); + FloatVector av1 = FloatVector.fromArray(SSPECIES, a, aOffset + row + (col + 1) * ((col + 1) + 1) / 2); + FloatVector av2 = FloatVector.fromArray(SSPECIES, a, aOffset + row + (col + 2) * ((col + 2) + 1) / 2); + FloatVector av3 = 
FloatVector.fromArray(SSPECIES, a, aOffset + row + (col + 3) * ((col + 3) + 1) / 2); + FloatVector yv = FloatVector.fromArray(SSPECIES, y, yOffset + row); + FloatVector xv = FloatVector.fromArray(SSPECIES, x, xOffset + row); + yv = alphaMulXV0.fma(av0, yv); + yv = alphaMulXV1.fma(av1, yv); + yv = alphaMulXV2.fma(av2, yv); + alphaMulXV3.fma(av3, yv).intoArray(y, yOffset + row); + accumv0 = xv.fma(av0, accumv0); + accumv1 = xv.fma(av1, accumv1); + accumv2 = xv.fma(av2, accumv2); + accumv3 = xv.fma(av3, accumv3); + } + float accum0 = alpha * accumv0.reduceLanes(VectorOperators.ADD); + float accum1 = alpha * accumv1.reduceLanes(VectorOperators.ADD); + float accum2 = alpha * accumv2.reduceLanes(VectorOperators.ADD); + float accum3 = alpha * accumv3.reduceLanes(VectorOperators.ADD); + for (; row < col; row++) { + float a0 = a[aOffset + row + col * (col + 1) / 2]; + float a1 = a[aOffset + row + (col + 1) * ((col + 1) + 1) / 2]; + float a2 = a[aOffset + row + (col + 2) * ((col + 2) + 1) / 2]; + float a3 = a[aOffset + row + (col + 3) * ((col + 3) + 1) / 2]; + float x0 = x[row + xOffset]; + y[row + yOffset] += alpha * (a0 * x[col + xOffset] + a1 * x[(col + 1) + xOffset] + + a2 * x[(col + 2) + xOffset] + a3 * x[(col + 3) + xOffset]); + accum0 += alpha * a0 * x0; + accum1 += alpha * a1 * x0; + accum2 += alpha * a2 * x0; + accum3 += alpha * a3 * x0; + } + float a00 = a[aOffset + row + col * (col + 1) / 2]; + float a01 = a[aOffset + row + (col + 1) * ((col + 1) + 1) / 2]; + float a02 = a[aOffset + row + (col + 2) * ((col + 2) + 1) / 2]; + float a03 = a[aOffset + row + (col + 3) * ((col + 3) + 1) / 2]; + float a11 = a[aOffset + (row + 1) + (col + 1) * ((col + 1) + 1) / 2]; + float a12 = a[aOffset + (row + 1) + (col + 2) * ((col + 2) + 1) / 2]; + float a13 = a[aOffset + (row + 1) + (col + 3) * ((col + 3) + 1) / 2]; + float a22 = a[aOffset + (row + 2) + (col + 2) * ((col + 2) + 1) / 2]; + float a23 = a[aOffset + (row + 2) + (col + 3) * ((col + 3) + 1) / 2]; + float a33 = a[aOffset 
+ (row + 3) + (col + 3) * ((col + 3) + 1) / 2]; + y[yOffset + col] += alphaMulX0 * a00 + alphaMulX1 * a01 + alphaMulX2 * a02 + alphaMulX3 * a03 + accum0; + y[yOffset + (col + 1)] += alphaMulX0 * a01 + alphaMulX1 * a11 + alphaMulX2 * a12 + alphaMulX3 * a13 + + accum1; + y[yOffset + (col + 2)] += alphaMulX0 * a02 + alphaMulX1 * a12 + alphaMulX2 * a22 + alphaMulX3 * a23 + + accum2; + y[yOffset + (col + 3)] += alphaMulX0 * a03 + alphaMulX1 * a13 + alphaMulX2 * a23 + alphaMulX3 * a33 + + accum3; + } + /* leftover columns, one at a time */ for (; col < n; col += 1) { + float alphaMulX0 = alpha * x[xOffset + col]; + FloatVector accumv0 = FloatVector.zero(SSPECIES); + FloatVector alphaMulXV0 = FloatVector.broadcast(SSPECIES, alphaMulX0); + int row = 0; + for (; row < col - col % SSPECIES.length(); row += SSPECIES.length()) { + FloatVector av = FloatVector.fromArray(SSPECIES, a, aOffset + row + col * (col + 1) / 2); + FloatVector yv = FloatVector.fromArray(SSPECIES, y, yOffset + row); + FloatVector xv = FloatVector.fromArray(SSPECIES, x, xOffset + row); + av.fma(alphaMulXV0, yv).intoArray(y, yOffset + row); + accumv0 = av.fma(xv, accumv0); + } + float accum0 = accumv0.reduceLanes(VectorOperators.ADD); + for (; row < col; row++) { + float a0 = a[aOffset + row + col * (col + 1) / 2]; + y[yOffset + row] += a0 * alphaMulX0; + accum0 += x[xOffset + row] * a0; + } + /* here row == col, so the packed index below is the diagonal entry */ y[yOffset + col] += a[aOffset + row + col * (col + 1) / 2] * alphaMulX0 + alpha * accum0; + } + } + + /** Strided fallback for the upper packed layout. aIndx is the 1-based position in the packed array of the first element of the current column, so a[row - 1 + aOffset] walks the col entries above the diagonal and a[aIndx + col - 1 + aOffset] is the diagonal entry. Each off-diagonal entry updates its own y element and also feeds the sum that is added to the y element of the column. xStartIndex and yStartIndex are the starting indices supplied by the caller. */ private static void norSspmvU(int n, float alpha, float[] a, int aOffset, float[] x, int xOffset, int incx, + float[] y, int yOffset, int incy, int xStartIndex, int yStartIndex) { + int aIndx = 1; + for (int col = 0, xIndx = xStartIndex, yIndx = yStartIndex; col < n; col++, xIndx += incx, yIndx += incy) { + float alphaMulX = alpha * x[xIndx + xOffset]; + float accum = 0.0f; + + for (int row = aIndx, xi = xStartIndex, yi = yStartIndex; row < aIndx + col; row++, xi += incx, + yi += incy) { + y[yi + yOffset] += alphaMulX * a[row - 1 + aOffset]; + accum += 
a[row - 1 + aOffset] * x[xi + xOffset]; + } + + y[yIndx + yOffset] = y[yIndx + yOffset] + alphaMulX * a[aIndx + col - 1 + aOffset] + alpha * accum; + /* column col holds col + 1 packed entries */ aIndx += col + 1; + } + } + + /** Unit-stride kernel for the lower packed layout, in which element (row, col) with row not less than col is read from a[aOffset + row - col * (col + 1) / 2 + n * col]. Columns are taken four at a time: the 4x4 diagonal block is folded into accum0 to accum3 with scalar code, then the rows below it are processed by a vector loop and a scalar tail, each element being used both for y[row] and for the sum of its own column. Leftover columns are handled one at a time by the second loop. */ private static void vecSspmvL(int n, float alpha, float[] a, int aOffset, float[] x, int xOffset, float[] y, + int yOffset) { + int col = 0; + int colLoopBound = loopBound(n, 4); + for (; col < colLoopBound; col += 4) { // 4 is unroll size for column + int row = col; + float alphaMulX0 = alpha * x[xOffset + col]; + float alphaMulX1 = alpha * x[xOffset + (col + 1)]; + float alphaMulX2 = alpha * x[xOffset + (col + 2)]; + float alphaMulX3 = alpha * x[xOffset + (col + 3)]; + FloatVector alphaMulXV0 = FloatVector.broadcast(SSPECIES, alphaMulX0); + FloatVector alphaMulXV1 = FloatVector.broadcast(SSPECIES, alphaMulX1); + FloatVector alphaMulXV2 = FloatVector.broadcast(SSPECIES, alphaMulX2); + FloatVector alphaMulXV3 = FloatVector.broadcast(SSPECIES, alphaMulX3); + float a00 = a[aOffset + row - col * (col + 1) / 2 + n * col]; + float a10 = a[aOffset + (row + 1) - col * (col + 1) / 2 + n * col]; + float a20 = a[aOffset + (row + 2) - col * (col + 1) / 2 + n * col]; + float a30 = a[aOffset + (row + 3) - col * (col + 1) / 2 + n * col]; + float a11 = a[aOffset + (row + 1) - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1)]; + float a21 = a[aOffset + (row + 2) - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1)]; + float a31 = a[aOffset + (row + 3) - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1)]; + float a22 = a[aOffset + (row + 2) - (col + 2) * ((col + 2) + 1) / 2 + n * (col + 2)]; + float a32 = a[aOffset + (row + 3) - (col + 2) * ((col + 2) + 1) / 2 + n * (col + 2)]; + float a33 = a[aOffset + (row + 3) - (col + 3) * ((col + 3) + 1) / 2 + n * (col + 3)]; + float accum0 = alphaMulX0 * a00 + alphaMulX1 * a10 + alphaMulX2 * a20 + alphaMulX3 * a30; + float accum1 = alphaMulX0 * a10 + alphaMulX1 * a11 + alphaMulX2 * a21 + alphaMulX3 * a31; + float accum2 = alphaMulX0 * a20 + alphaMulX1 * a21 + alphaMulX2 * 
a22 + alphaMulX3 * a32; + float accum3 = alphaMulX0 * a30 + alphaMulX1 * a31 + alphaMulX2 * a32 + alphaMulX3 * a33; + FloatVector accumv0 = FloatVector.zero(SSPECIES); + FloatVector accumv1 = FloatVector.zero(SSPECIES); + FloatVector accumv2 = FloatVector.zero(SSPECIES); + FloatVector accumv3 = FloatVector.zero(SSPECIES); + row += 4; + for (; row <= (n - n % SSPECIES.length() - SSPECIES.length()); row += SSPECIES.length()) { + FloatVector av0 = FloatVector.fromArray(SSPECIES, a, aOffset + row - col * (col + 1) / 2 + n * col); + FloatVector av1 = FloatVector.fromArray(SSPECIES, a, + aOffset + row - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1)); + FloatVector av2 = FloatVector.fromArray(SSPECIES, a, + aOffset + row - (col + 2) * ((col + 2) + 1) / 2 + n * (col + 2)); + FloatVector av3 = FloatVector.fromArray(SSPECIES, a, + aOffset + row - (col + 3) * ((col + 3) + 1) / 2 + n * (col + 3)); + FloatVector yv = FloatVector.fromArray(SSPECIES, y, yOffset + row); + FloatVector xv = FloatVector.fromArray(SSPECIES, x, xOffset + row); + yv = alphaMulXV0.fma(av0, yv); + yv = alphaMulXV1.fma(av1, yv); + yv = alphaMulXV2.fma(av2, yv); + alphaMulXV3.fma(av3, yv).intoArray(y, yOffset + row); + accumv0 = xv.fma(av0, accumv0); + accumv1 = xv.fma(av1, accumv1); + accumv2 = xv.fma(av2, accumv2); + accumv3 = xv.fma(av3, accumv3); + } + accum0 += alpha * accumv0.reduceLanes(VectorOperators.ADD); + accum1 += alpha * accumv1.reduceLanes(VectorOperators.ADD); + accum2 += alpha * accumv2.reduceLanes(VectorOperators.ADD); + accum3 += alpha * accumv3.reduceLanes(VectorOperators.ADD); + for (; row < n; row += 1) { + float a0 = a[aOffset + row - col * (col + 1) / 2 + n * col]; + float a1 = a[aOffset + row - (col + 1) * ((col + 1) + 1) / 2 + n * (col + 1)]; + float a2 = a[aOffset + row - (col + 2) * ((col + 2) + 1) / 2 + n * (col + 2)]; + float a3 = a[aOffset + row - (col + 3) * ((col + 3) + 1) / 2 + n * (col + 3)]; + y[yOffset + row] += alphaMulX0 * a0 + alphaMulX1 * a1 + alphaMulX2 * a2 + 
alphaMulX3 * a3; + accum0 += alpha * x[xOffset + row] * a0; + accum1 += alpha * x[xOffset + row] * a1; + accum2 += alpha * x[xOffset + row] * a2; + accum3 += alpha * x[xOffset + row] * a3; + } + y[yOffset + col] += accum0; + y[yOffset + (col + 1)] += accum1; + y[yOffset + (col + 2)] += accum2; + y[yOffset + (col + 3)] += accum3; + } + /* leftover columns, one at a time, scalar only */ for (; col < n; col += 1) { + float alphaMulX0 = alpha * x[xOffset + col]; + y[yOffset + col] += a[aOffset + col - col * (col + 1) / 2 + n * col] * alphaMulX0; + int row = col + 1; + float accum0 = 0.0f; + for (; row < n; row++) { + float a0 = a[aOffset + row - col * (col + 1) / 2 + n * col]; + y[yOffset + row] += a0 * alphaMulX0; + accum0 += x[xOffset + row] * a0; + } + y[yOffset + col] += alpha * accum0; + } + } + + /** Strided fallback for the lower packed layout. aIndx is the 1-based position in the packed array of the diagonal entry of the current column; the n - col - 1 entries after it are the rows below the diagonal. Each of those entries updates its own y element and also feeds the sum that is added to the y element of the column. xStartIndex and yStartIndex are the starting indices supplied by the caller. */ private static void norSspmvL(int n, float alpha, float[] a, int aOffset, float[] x, int xOffset, int incx, + float[] y, int yOffset, int incy, int xStartIndex, int yStartIndex) { + int aIndx = 1; + for (int col = 0, xIndx = xStartIndex, yIndx = yStartIndex; col < n; col++, xIndx += incx, yIndx += incy) { + float alphaMulX = alpha * x[xIndx + xOffset]; + float accum = 0.0f; + y[yIndx + yOffset] += alphaMulX * a[aIndx - 1 + aOffset]; + + for (int row = aIndx + 1, xi = xIndx + incx, yi = yIndx + incy; row < aIndx + n - col; row++, xi += incx, + yi += incy) { + y[yi + yOffset] += alphaMulX * a[row - 1 + aOffset]; + accum += a[row - 1 + aOffset] * x[xi + xOffset]; + } + y[yIndx + yOffset] += alpha * accum; + /* column col holds n - col packed entries */ aIndx += n - col; + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sspr.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sspr.java new file mode 100644 index 0000000000000000000000000000000000000000..b229bbeb4a4faaab077d21ab4952c2a04fbe6efd --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Sspr.java @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. 
+ * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas2.singleprecision; + +import com.huawei.vectorblas.blas1.singleprecision.Saxpy; +import com.huawei.vectorblas.utils.BlasUtils; +import com.huawei.vectorblas.utils.Lsame; + +/** Single-precision rank-one update of a symmetric matrix held in packed form, implemented as one Saxpy.saxpy call per column. */ public class Sspr { + /** Entry point. After the BlasUtils checks (uplo matches U or L, n non-negative, incx non-zero) it returns early when n == 0 or alpha is zero. For column j, colCnt is j + 1 for the upper triangle and n - j for the lower one, and cnt is the running start of that column in ap. When x[xIndx + xOffset] is non-zero, Saxpy.saxpy is called with scale alpha * x[xIndx + xOffset], the x slice starting at xOffset + kIndx with stride incx, and the destination ap starting at aOffset + cnt with stride 1. The two loops differ only in which triangle uses xIndx rather than 0 as kIndx. NOTE(review): presumably saxpy adds the scaled slice to the destination and applies its own start adjustment for a negative stride; confirm in Saxpy. */ public static void sspr(String uplo, int n, float alpha, float[] x, int xOffset, int incx, float[] ap, + int aOffset) { + BlasUtils.checkParameter("SSPR", 1, Lsame.lsame(uplo, "U") || Lsame.lsame(uplo, "L")); + BlasUtils.checkParameter("SSPR", 2, n >= 0); + BlasUtils.checkParameter("SSPR", 5, incx != 0); + + if (n == 0 || BlasUtils.isZero(alpha)) { + return; + } + + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + BlasUtils.checkBlasArray("a", aOffset, (1 + n) * n / 2 - 1, ap.length); + + boolean uploFlag = Lsame.lsame(uplo, "U"); + int xStartIndx = 0; + if (incx <= 0) { + xStartIndx = -(n - 1) * incx; + } + + int cnt = 0; + if (incx >= 0) { + for (int j = 0, xIndx = xStartIndx; j < n; j++, xIndx += incx) { + int colCnt = 
uploFlag ? j + 1 : n - j; + if (!BlasUtils.isZero(x[xIndx + xOffset])) { + int kIndx = uploFlag ? 0 : xIndx; + Saxpy.saxpy(colCnt, alpha * x[xIndx + xOffset], x, xOffset + kIndx, incx, ap, aOffset + cnt, 1); + } + cnt += colCnt; + } + } else { + for (int j = 0, xIndx = xStartIndx; j < n; j++, xIndx += incx) { + int colCnt = uploFlag ? j + 1 : n - j; + if (!BlasUtils.isZero(x[xIndx + xOffset])) { + int kIndx = uploFlag ? xIndx : 0; + Saxpy.saxpy(colCnt, alpha * x[xIndx + xOffset], x, xOffset + kIndx, incx, ap, aOffset + cnt, 1); + } + cnt += colCnt; + } + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Ssymv.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Ssymv.java new file mode 100644 index 0000000000000000000000000000000000000000..2af10f8ce7d6f4f0fcdd59ceb15e86a6056aa17a --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas2/singleprecision/Ssymv.java @@ -0,0 +1,279 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas2.singleprecision; + +import static com.huawei.vectorblas.utils.ArrayUtil.loopBound; + +import com.huawei.vectorblas.utils.BlasUtils; +import com.huawei.vectorblas.utils.Lsame; + +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; + +/** Single-precision symmetric matrix-vector update on a full array addressed as a[row + col * lda + aOffset]. The upper-triangle kernels shown in this class read only entries with row not greater than col and add alpha * A * x to y; the beta handling is delegated to SblasLevel2.sMulBeta. */ public class Ssymv { + private static final VectorSpecies SSPECIES = FloatVector.SPECIES_MAX; + + /** Entry point. Checks through BlasUtils.checkParameter that uplo matches U or L under Lsame.lsame, that n is non-negative, that lda is at least max(1, n) and that incx and incy are non-zero. Returns immediately when n == 0, or when alpha is zero and beta compares equal to 1.0f. Otherwise the x, y and a ranges go to BlasUtils.checkBlasArray, y is handed to SblasLevel2.sMulBeta when beta is not 1.0f, and, unless alpha is zero, one of four kernels is chosen by uplo and by whether both strides are 1. NOTE(review): sMulBeta presumably scales y by beta; confirm in SblasLevel2. */ public static void ssymv(String uplo, int n, float alpha, float[] a, int aOffset, int lda, float[] x, int xOffset, + int incx, float beta, float[] y, int yOffset, int incy) { + BlasUtils.checkParameter("SSYMV", 1, Lsame.lsame(uplo, "U") || Lsame.lsame(uplo, "L")); + BlasUtils.checkParameter("SSYMV", 2, n >= 0); + BlasUtils.checkParameter("SSYMV", 5, lda >= Math.max(1, n)); + BlasUtils.checkParameter("SSYMV", 7, incx != 0); + BlasUtils.checkParameter("SSYMV", 10, incy != 0); + + if (n == 0 || (BlasUtils.isZero(alpha) && Float.compare(beta, 1.0f) == 0)) { + return; + } + + BlasUtils.checkBlasArray("x", xOffset, Math.abs(incx) * (n - 1), x.length); + BlasUtils.checkBlasArray("y", yOffset, Math.abs(incy) * (n - 1), y.length); + BlasUtils.checkBlasArray("a", aOffset, (n - 1) + (n - 1) * lda, a.length); + + boolean uploFlag = Lsame.lsame(uplo, "U"); + int xStartIndex = incx > 0 ? 0 : (n - 1) * (-incx); + int yStartIndex = incy > 0 ? 
0 : (n - 1) * (-incy); + if (Float.compare(beta, 1.0f) != 0) { + SblasLevel2.sMulBeta(n, beta, y, yOffset, incy); + } + if (BlasUtils.isZero(alpha)) { + return; + } + if (uploFlag) { + if (incx == 1 && incy == 1) { + vecSsymvU(n, x, xOffset, alpha, y, yOffset, a, aOffset, lda); + } else { + norSsymvU(n, x, xOffset, incx, alpha, y, yOffset, incy, a, aOffset, lda, xStartIndex, yStartIndex); + } + } else if (incx == 1 && incy == 1) { + vecSsymvL(n, x, xOffset, alpha, y, yOffset, a, aOffset, lda); + } else { + norSsymvL(n, x, xOffset, incx, alpha, y, yOffset, incy, a, aOffset, lda, xStartIndex, yStartIndex); + } + } + + /** Unit-stride kernel for the upper triangle; element (row, col) with row not greater than col is read from a[row + col * lda + aOffset]. Columns are taken four at a time. Each element above the 4x4 diagonal block is used twice: once to update y[row] and once in the running sum that is later added to the y entry of its own column. The diagonal block itself is handled with scalar code, and leftover columns are processed one at a time by the second loop. */ private static void vecSsymvU(int n, float[] x, int xOffset, float alpha, float[] y, int yOffset, float[] a, + int aOffset, int lda) { + int col = 0; + int colLoopBound = loopBound(n, 4); + for (; col < colLoopBound; col += 4) { // 4 is unroll size for column + float alphaMulX0 = alpha * x[col + xOffset]; + float alphaMulX1 = alpha * x[(col + 1) + xOffset]; + float alphaMulX2 = alpha * x[(col + 2) + xOffset]; + float alphaMulX3 = alpha * x[(col + 3) + xOffset]; + FloatVector alphaXv0 = FloatVector.broadcast(SSPECIES, alphaMulX0); + FloatVector alphaXv1 = FloatVector.broadcast(SSPECIES, alphaMulX1); + FloatVector alphaXv2 = FloatVector.broadcast(SSPECIES, alphaMulX2); + FloatVector alphaXv3 = FloatVector.broadcast(SSPECIES, alphaMulX3); + FloatVector accumv0 = FloatVector.zero(SSPECIES); + FloatVector accumv1 = FloatVector.zero(SSPECIES); + FloatVector accumv2 = FloatVector.zero(SSPECIES); + FloatVector accumv3 = FloatVector.zero(SSPECIES); + int row = 0; + for (; row < col - col % SSPECIES.length(); row += SSPECIES.length()) { + FloatVector av0 = FloatVector.fromArray(SSPECIES, a, row + col * lda + aOffset); + FloatVector av1 = FloatVector.fromArray(SSPECIES, a, row + (col + 1) * lda + aOffset); + FloatVector av2 = FloatVector.fromArray(SSPECIES, a, row + (col + 2) * lda + aOffset); + FloatVector av3 = FloatVector.fromArray(SSPECIES, a, row + (col + 3) 
* lda + aOffset); + FloatVector yv = FloatVector.fromArray(SSPECIES, y, row + yOffset); + FloatVector xv = FloatVector.fromArray(SSPECIES, x, row + xOffset); + yv = av0.fma(alphaXv0, yv); + yv = av1.fma(alphaXv1, yv); + yv = av2.fma(alphaXv2, yv); + av3.fma(alphaXv3, yv).intoArray(y, row + yOffset); + accumv0 = av0.fma(xv, accumv0); + accumv1 = av1.fma(xv, accumv1); + accumv2 = av2.fma(xv, accumv2); + accumv3 = av3.fma(xv, accumv3); + } + float accum0 = alpha * accumv0.reduceLanes(VectorOperators.ADD); + float accum1 = alpha * accumv1.reduceLanes(VectorOperators.ADD); + float accum2 = alpha * accumv2.reduceLanes(VectorOperators.ADD); + float accum3 = alpha * accumv3.reduceLanes(VectorOperators.ADD); + for (; row < col; row++) { + float a0 = a[row + col * lda + aOffset]; + float a1 = a[row + (col + 1) * lda + aOffset]; + float a2 = a[row + (col + 2) * lda + aOffset]; + float a3 = a[row + (col + 3) * lda + aOffset]; + float x0 = x[row + xOffset]; + y[row + yOffset] += alpha * (a0 * x[col + xOffset] + a1 * x[(col + 1) + xOffset] + + a2 * x[(col + 2) + xOffset] + a3 * x[(col + 3) + xOffset]); + accum0 += alpha * a0 * x0; + accum1 += alpha * a1 * x0; + accum2 += alpha * a2 * x0; + accum3 += alpha * a3 * x0; + } + float a00 = a[row + col * lda + aOffset]; + float a01 = a[row + (col + 1) * lda + aOffset]; + float a02 = a[row + (col + 2) * lda + aOffset]; + float a03 = a[row + (col + 3) * lda + aOffset]; + float a11 = a[(row + 1) + (col + 1) * lda + aOffset]; + float a12 = a[(row + 1) + (col + 2) * lda + aOffset]; + float a13 = a[(row + 1) + (col + 3) * lda + aOffset]; + float a22 = a[(row + 2) + (col + 2) * lda + aOffset]; + float a23 = a[(row + 2) + (col + 3) * lda + aOffset]; + float a33 = a[(row + 3) + (col + 3) * lda + aOffset]; + y[col + yOffset] += a00 * alphaMulX0 + a01 * alphaMulX1 + a02 * alphaMulX2 + a03 * alphaMulX3 + accum0; + y[(col + 1) + yOffset] += a01 * alphaMulX0 + a11 * alphaMulX1 + a12 * alphaMulX2 + a13 * alphaMulX3 + + accum1; + y[(col + 2) + 
yOffset] += a02 * alphaMulX0 + a12 * alphaMulX1 + a22 * alphaMulX2 + a23 * alphaMulX3 + + accum2; + y[(col + 3) + yOffset] += a03 * alphaMulX0 + a13 * alphaMulX1 + a23 * alphaMulX2 + a33 * alphaMulX3 + + accum3; + } + for (; col < n; col++) { + float alphaMulX0 = alpha * x[col + xOffset]; + FloatVector alphaXv0 = FloatVector.broadcast(SSPECIES, alphaMulX0); + FloatVector accumv0 = FloatVector.zero(SSPECIES); + int row = 0; + for (; row < col - col % SSPECIES.length(); row += SSPECIES.length()) { + FloatVector av = FloatVector.fromArray(SSPECIES, a, row + col * lda + aOffset); + FloatVector yv = FloatVector.fromArray(SSPECIES, y, row + yOffset); + FloatVector xv = FloatVector.fromArray(SSPECIES, x, row + xOffset); + av.fma(alphaXv0, yv).intoArray(y, row + yOffset); + accumv0 = av.fma(xv, accumv0); + } + float accum0 = alpha * accumv0.reduceLanes(VectorOperators.ADD); + for (; row < col; row++) { + float a0 = a[row + col * lda + aOffset]; + y[row + yOffset] += a0 * alphaMulX0; + accum0 += alpha * a0 * x[row + xOffset]; + } + y[col + yOffset] += a[row + col * lda + aOffset] * alphaMulX0 + accum0; + } + } + + private static void norSsymvU(int n, float[] x, int xOffset, int incx, float alpha, float[] y, int yOffset, + int incy, float[] a, int aOffset, int lda, int xStartIndex, int yStartIndex) { + for (int col = 0, xj = xStartIndex, yj = yStartIndex; col < n; col++, xj += incx, yj += incy) { + float alphaMulX = alpha * x[xj + xOffset]; + float accum = 0.0f; + + for (int row = 0, xIndx = xStartIndex, yIndx = yStartIndex; row < col; row++, xIndx += incx, + yIndx += incy) { + y[yIndx + yOffset] += alphaMulX * a[row + col * lda + aOffset]; + accum += a[row + col * lda + aOffset] * x[xIndx + xOffset]; + } + y[yj + yOffset] += alphaMulX * a[col + col * lda + aOffset] + alpha * accum; + } + } + + private static void vecSsymvL(int n, float[] x, int xOffset, float alpha, float[] y, int yOffset, float[] a, + int aOffset, int lda) { + int col = 0; + int colLoopBound = loopBound(n, 
4); + for (; col < colLoopBound; col += 4) { // 4 is unroll size for column + int row = col; + float a00 = a[aOffset + row + col * lda]; + float a10 = a[aOffset + (row + 1) + col * lda]; + float a20 = a[aOffset + (row + 2) + col * lda]; + float a30 = a[aOffset + (row + 3) + col * lda]; + float a11 = a[aOffset + (row + 1) + (col + 1) * lda]; + float a21 = a[aOffset + (row + 2) + (col + 1) * lda]; + float a31 = a[aOffset + (row + 3) + (col + 1) * lda]; + float a22 = a[aOffset + (row + 2) + (col + 2) * lda]; + float a32 = a[aOffset + (row + 3) + (col + 2) * lda]; + float a33 = a[aOffset + (row + 3) + (col + 3) * lda]; + float alphaMulX0 = alpha * x[xOffset + col]; + float alphaMulX1 = alpha * x[xOffset + (col + 1)]; + float alphaMulX2 = alpha * x[xOffset + (col + 2)]; + float alphaMulX3 = alpha * x[xOffset + (col + 3)]; + float accum0 = alphaMulX0 * a00 + alphaMulX1 * a10 + alphaMulX2 * a20 + alphaMulX3 * a30; + float accum1 = alphaMulX0 * a10 + alphaMulX1 * a11 + alphaMulX2 * a21 + alphaMulX3 * a31; + float accum2 = alphaMulX0 * a20 + alphaMulX1 * a21 + alphaMulX2 * a22 + alphaMulX3 * a32; + float accum3 = alphaMulX0 * a30 + alphaMulX1 * a31 + alphaMulX2 * a32 + alphaMulX3 * a33; + FloatVector alphaMulXV0 = FloatVector.broadcast(SSPECIES, alphaMulX0); + FloatVector alphaMulXV1 = FloatVector.broadcast(SSPECIES, alphaMulX1); + FloatVector alphaMulXV2 = FloatVector.broadcast(SSPECIES, alphaMulX2); + FloatVector alphaMulXV3 = FloatVector.broadcast(SSPECIES, alphaMulX3); + FloatVector accumv0 = FloatVector.zero(SSPECIES); + FloatVector accumv1 = FloatVector.zero(SSPECIES); + FloatVector accumv2 = FloatVector.zero(SSPECIES); + FloatVector accumv3 = FloatVector.zero(SSPECIES); + row += 4; + for (; row <= (n - n % SSPECIES.length() - SSPECIES.length()); row += SSPECIES.length()) { + FloatVector av0 = FloatVector.fromArray(SSPECIES, a, aOffset + row + col * lda); + FloatVector av1 = FloatVector.fromArray(SSPECIES, a, aOffset + row + (col + 1) * lda); + FloatVector av2 = 
FloatVector.fromArray(SSPECIES, a, aOffset + row + (col + 2) * lda); + FloatVector av3 = FloatVector.fromArray(SSPECIES, a, aOffset + row + (col + 3) * lda); + FloatVector yv = FloatVector.fromArray(SSPECIES, y, yOffset + row); + FloatVector xv = FloatVector.fromArray(SSPECIES, x, xOffset + row); + yv = alphaMulXV0.fma(av0, yv); + yv = alphaMulXV1.fma(av1, yv); + yv = alphaMulXV2.fma(av2, yv); + alphaMulXV3.fma(av3, yv).intoArray(y, yOffset + row); + accumv0 = xv.fma(av0, accumv0); + accumv1 = xv.fma(av1, accumv1); + accumv2 = xv.fma(av2, accumv2); + accumv3 = xv.fma(av3, accumv3); + } + accum0 += alpha * accumv0.reduceLanes(VectorOperators.ADD); + accum1 += alpha * accumv1.reduceLanes(VectorOperators.ADD); + accum2 += alpha * accumv2.reduceLanes(VectorOperators.ADD); + accum3 += alpha * accumv3.reduceLanes(VectorOperators.ADD); + for (; row < n; row += 1) { + float a0 = a[aOffset + row + col * lda]; + float a1 = a[aOffset + row + (col + 1) * lda]; + float a2 = a[aOffset + row + (col + 2) * lda]; + float a3 = a[aOffset + row + (col + 3) * lda]; + y[yOffset + row] += alphaMulX0 * a0 + alphaMulX1 * a1 + alphaMulX2 * a2 + alphaMulX3 * a3; + accum0 += alpha * x[xOffset + row] * a0; + accum1 += alpha * x[xOffset + row] * a1; + accum2 += alpha * x[xOffset + row] * a2; + accum3 += alpha * x[xOffset + row] * a3; + } + y[yOffset + col] += accum0; + y[yOffset + (col + 1)] += accum1; + y[yOffset + (col + 2)] += accum2; + y[yOffset + (col + 3)] += accum3; + } + for (; col < n; col += 1) { + float alphaMulX0 = alpha * x[xOffset + col]; + y[yOffset + col] += a[aOffset + col + col * lda] * alphaMulX0; + int row = col + 1; + float accum0 = 0.0f; + for (; row < n; row++) { + float a0 = a[aOffset + row + col * lda]; + y[yOffset + row] += a0 * alphaMulX0; + accum0 += x[xOffset + row] * a0; + } + y[yOffset + col] += alpha * accum0; + } + } + + private static void norSsymvL(int n, float[] x, int xOffset, int incx, float alpha, float[] y, int yOffset, + int incy, float[] a, int aOffset, 
int lda, int xStartIndex, int yStartIndex) { + for (int col = 0, xj = xStartIndex, yj = yStartIndex; col < n; col++, xj += incx, yj += incy) { + float alphaMulX = alpha * x[xj + xOffset]; + y[yj + yOffset] += alphaMulX * a[col + col * lda + aOffset]; + float accum = 0.0f; + + for (int row = col + 1, xIndx = xj + incx, yIndx = yj + incy; row < n; row++, xIndx += incx, + yIndx += incy) { + y[yIndx + yOffset] += alphaMulX * a[row + col * lda + aOffset]; + accum += a[row + col * lda + aOffset] * x[xIndx + xOffset]; + } + y[yj + yOffset] += alpha * accum; + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/doubleprecision/DblasLevel3.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/doubleprecision/DblasLevel3.java new file mode 100644 index 0000000000000000000000000000000000000000..df6ab57a6a09bbce340db416cf1bce0057c7107d --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/doubleprecision/DblasLevel3.java @@ -0,0 +1,475 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas3.doubleprecision; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorSpecies; + +public class DblasLevel3 { + private static final VectorSpecies DSPECIES = DoubleVector.SPECIES_MAX; + protected static final int DGEMM_P = 256; // Blocking size for m direction. + protected static final int DGEMM_Q = 240; // Blocking size for k direction. + protected static final int DGEMM_R = 8192; // Blocking size for n direction. + protected static final int VECTOR_LENGTH = DSPECIES.length(); + protected static final int VECTOR_LENGTH2 = 2 * VECTOR_LENGTH; // 2 times vector length + protected static final int VECTOR_LENGTH3 = 3 * VECTOR_LENGTH; // 3 times vector length + protected static final int VECTOR_LENGTH4 = 4 * VECTOR_LENGTH; // 4 times vector length + protected static final int DGEMM_UNROLL_M = 4 * VECTOR_LENGTH; // Kernel size for m is 4 * DSPECIES.length(). + protected static final int DGEMM_UNROLL_N = 4; // Kernel size for n direction is 4. 
+ + protected static void betaMulC(int sizeM, int sizeN, double beta, double[] dc, int cOffset, int ldc) { + DoubleVector betav = DoubleVector.broadcast(DSPECIES, beta); + for (int col = 0; col < sizeN; col++) { + int row = 0; + for (; row < sizeM - VECTOR_LENGTH; row += VECTOR_LENGTH) { + DoubleVector cv = DoubleVector.fromArray(DSPECIES, dc, row + col * ldc + cOffset); + cv.mul(betav).intoArray(dc, row + col * ldc + cOffset); + } + for (; row < sizeM; row++) { + dc[row + col * ldc + cOffset] *= beta; + } + } + } + + protected static void kernelOperation8x4(int mc, int nc, int kc, double alpha, double[] da, double[] db, + int bOffset, double[] dc, int ldc, int cOffset, int csRow, int csCol) { + kernelOperation8x4Main(mc, nc, kc, alpha, da, db, bOffset, dc, ldc, cOffset, csRow, csCol); + kernelOperation8x4NBorder(mc, nc, kc, alpha, da, db, bOffset, dc, ldc, cOffset, csRow, csCol); + } + + private static void kernelOperation8x4NBorder(int mc, int nc, int kc, double alpha, double[] da, double[] db, + int bOffset, double[] dc, int ldc, int cOffset, int csRow, int csCol) { + DoubleVector alphaVec = DoubleVector.broadcast(DSPECIES, alpha); + int cCol = csCol + (nc / DGEMM_UNROLL_N) * DGEMM_UNROLL_N; + int countJ = nc % DGEMM_UNROLL_N; + for (; countJ > 0; countJ--) { + int cRow = csRow; + int aIndx = 0; + int countI = mc / DGEMM_UNROLL_M; + for (; countI > 0; countI--) { + int bIndx = (nc - countJ) * kc; + DoubleVector c00 = DoubleVector.zero(DSPECIES); + DoubleVector c10 = DoubleVector.zero(DSPECIES); + DoubleVector c20 = DoubleVector.zero(DSPECIES); + DoubleVector c30 = DoubleVector.zero(DSPECIES); + int countL = kc; + for (; countL > 0; countL--) { + DoubleVector a0 = DoubleVector.fromArray(DSPECIES, da, aIndx); + DoubleVector a1 = DoubleVector.fromArray(DSPECIES, da, aIndx + VECTOR_LENGTH); + DoubleVector a2 = DoubleVector.fromArray(DSPECIES, da, aIndx + VECTOR_LENGTH2); + DoubleVector a3 = DoubleVector.fromArray(DSPECIES, da, aIndx + VECTOR_LENGTH3); + + 
DoubleVector b0 = DoubleVector.broadcast(DSPECIES, db[bOffset + bIndx]); + + c00 = a0.fma(b0, c00); + c10 = a1.fma(b0, c10); + c20 = a2.fma(b0, c20); + c30 = a3.fma(b0, c30); + + aIndx += DGEMM_UNROLL_M; + bIndx += 1; + } + alphaVec.fma(c00, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + cCol * ldc)).intoArray(dc, + cOffset + cRow + cCol * ldc); + alphaVec.fma(c10, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH + + cCol * ldc)).intoArray(dc, cOffset + cRow + VECTOR_LENGTH + cCol * ldc); + alphaVec.fma(c20, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH2 + + cCol * ldc)).intoArray(dc, cOffset + cRow + VECTOR_LENGTH2 + cCol * ldc); + alphaVec.fma(c30, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH3 + + cCol * ldc)).intoArray(dc, cOffset + cRow + VECTOR_LENGTH3 + cCol * ldc); + + cRow += DGEMM_UNROLL_M; + } + countI = mc % DGEMM_UNROLL_M; + if (countI >= VECTOR_LENGTH2) { + int bIndx = (nc - countJ) * kc; + DoubleVector c00 = DoubleVector.zero(DSPECIES); + DoubleVector c10 = DoubleVector.zero(DSPECIES); + int countL = kc; + for (; countL > 0; countL--) { + DoubleVector a0 = DoubleVector.fromArray(DSPECIES, da, aIndx); + DoubleVector a1 = DoubleVector.fromArray(DSPECIES, da, aIndx + VECTOR_LENGTH); + DoubleVector b0 = DoubleVector.broadcast(DSPECIES, db[bOffset + bIndx]); + + c00 = a0.fma(b0, c00); + c10 = a1.fma(b0, c10); + + aIndx += VECTOR_LENGTH2; + bIndx += 1; + } + alphaVec.fma(c00, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + cCol * ldc)).intoArray(dc, + cOffset + cRow + cCol * ldc); + alphaVec.fma(c10, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH + + cCol * ldc)).intoArray(dc, cOffset + cRow + VECTOR_LENGTH + cCol * ldc); + + cRow += VECTOR_LENGTH2; + countI -= VECTOR_LENGTH2; + } + if (countI >= VECTOR_LENGTH) { + int bIndx = (nc - countJ) * kc; + DoubleVector c00 = DoubleVector.zero(DSPECIES); + int countL = kc; + for (; countL > 0; countL--) { + DoubleVector 
a0 = DoubleVector.fromArray(DSPECIES, da, aIndx); + DoubleVector b0 = DoubleVector.broadcast(DSPECIES, db[bOffset + bIndx]); + c00 = a0.fma(b0, c00); + aIndx += VECTOR_LENGTH; + bIndx += 1; + } + alphaVec.fma(c00, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + cCol * ldc)).intoArray(dc, + cOffset + cRow + cCol * ldc); + + cRow += VECTOR_LENGTH; + countI -= VECTOR_LENGTH; + } + while (countI > 0) { + int bIndx = (nc - countJ) * kc; + double[] cTmp = new double[1]; + int countL = kc; + for (; countL > 0; countL--) { + cTmp[0] += da[aIndx] * db[bIndx]; + aIndx += 1; + bIndx += 1; + } + dc[cOffset + cRow + cCol * ldc] += alpha * cTmp[0]; + + cRow += 1; + countI -= 1; + } + cCol += 1; + } + } + + private static void kernelOperation8x4Main(int mc, int nc, int kc, double alpha, double[] da, double[] db, + int bOffset, double[] dc, int ldc, int cOffset, int csRow, int csCol) { + DoubleVector alphaVec = DoubleVector.broadcast(DSPECIES, alpha); + int countJ = nc / DGEMM_UNROLL_N; + int cCol = csCol; + for (; countJ > 0; countJ--) { + int cRow = csRow; + int aIndx = 0; + int countI = mc / DGEMM_UNROLL_M; + for (; countI > 0; countI--) { + DoubleVector c00 = DoubleVector.zero(DSPECIES); + DoubleVector c10 = DoubleVector.zero(DSPECIES); + DoubleVector c20 = DoubleVector.zero(DSPECIES); + DoubleVector c30 = DoubleVector.zero(DSPECIES); + DoubleVector c01 = DoubleVector.zero(DSPECIES); + DoubleVector c11 = DoubleVector.zero(DSPECIES); + DoubleVector c21 = DoubleVector.zero(DSPECIES); + DoubleVector c31 = DoubleVector.zero(DSPECIES); + DoubleVector c02 = DoubleVector.zero(DSPECIES); + DoubleVector c12 = DoubleVector.zero(DSPECIES); + DoubleVector c22 = DoubleVector.zero(DSPECIES); + DoubleVector c32 = DoubleVector.zero(DSPECIES); + DoubleVector c03 = DoubleVector.zero(DSPECIES); + DoubleVector c13 = DoubleVector.zero(DSPECIES); + DoubleVector c23 = DoubleVector.zero(DSPECIES); + DoubleVector c33 = DoubleVector.zero(DSPECIES); + int bIndx = (nc / DGEMM_UNROLL_N - countJ) * 
DGEMM_UNROLL_N * kc; + + int countL = kc; + for (; countL > 0; countL--) { + DoubleVector b0 = DoubleVector.broadcast(DSPECIES, db[bOffset + bIndx]); + DoubleVector a0 = DoubleVector.fromArray(DSPECIES, da, aIndx); + DoubleVector a1 = DoubleVector.fromArray(DSPECIES, da, aIndx + VECTOR_LENGTH); + DoubleVector a2 = DoubleVector.fromArray(DSPECIES, da, aIndx + VECTOR_LENGTH2); + DoubleVector a3 = DoubleVector.fromArray(DSPECIES, da, aIndx + VECTOR_LENGTH3); + + c00 = a0.fma(b0, c00); + c10 = a1.fma(b0, c10); + c20 = a2.fma(b0, c20); + c30 = a3.fma(b0, c30); + + DoubleVector b1 = DoubleVector.broadcast(DSPECIES, db[bOffset + bIndx + 1]); + c01 = a0.fma(b1, c01); + c11 = a1.fma(b1, c11); + c21 = a2.fma(b1, c21); + c31 = a3.fma(b1, c31); + + DoubleVector b2 = DoubleVector.broadcast(DSPECIES, db[bOffset + bIndx + 2]); + c02 = a0.fma(b2, c02); + c12 = a1.fma(b2, c12); + c22 = a2.fma(b2, c22); + c32 = a3.fma(b2, c32); + + DoubleVector b3 = DoubleVector.broadcast(DSPECIES, db[bOffset + bIndx + 3]); + c03 = a0.fma(b3, c03); + c13 = a1.fma(b3, c13); + c23 = a2.fma(b3, c23); + c33 = a3.fma(b3, c33); + aIndx += DGEMM_UNROLL_M; + bIndx += DGEMM_UNROLL_N; + } + DoubleVector cOri00 = DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + cCol * ldc); + DoubleVector cOri10 = DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH + + cCol * ldc); + DoubleVector cOri20 = DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH2 + + cCol * ldc); + DoubleVector cOri30 = DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH3 + + cCol * ldc); + + cOri00 = alphaVec.fma(c00, cOri00); + cOri10 = alphaVec.fma(c10, cOri10); + cOri20 = alphaVec.fma(c20, cOri20); + cOri30 = alphaVec.fma(c30, cOri30); + + cOri00.intoArray(dc, cOffset + cRow + cCol * ldc); + cOri10.intoArray(dc, cOffset + cRow + VECTOR_LENGTH + cCol * ldc); + cOri20.intoArray(dc, cOffset + cRow + VECTOR_LENGTH2 + cCol * ldc); + cOri30.intoArray(dc, cOffset + cRow + VECTOR_LENGTH3 + cCol * 
ldc); + + DoubleVector cOri01 = DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + (cCol + 1) * ldc); + DoubleVector cOri11 = DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH + + (cCol + 1) * ldc); + DoubleVector cOri21 = DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH2 + + (cCol + 1) * ldc); + DoubleVector cOri31 = DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH3 + + (cCol + 1) * ldc); + + cOri01 = alphaVec.fma(c01, cOri01); + cOri11 = alphaVec.fma(c11, cOri11); + cOri21 = alphaVec.fma(c21, cOri21); + cOri31 = alphaVec.fma(c31, cOri31); + + cOri01.intoArray(dc, cOffset + cRow + (cCol + 1) * ldc); + cOri11.intoArray(dc, cOffset + cRow + VECTOR_LENGTH + (cCol + 1) * ldc); + cOri21.intoArray(dc, cOffset + cRow + VECTOR_LENGTH2 + (cCol + 1) * ldc); + cOri31.intoArray(dc, cOffset + cRow + VECTOR_LENGTH3 + (cCol + 1) * ldc); + + DoubleVector cOri02 = DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + (cCol + 2) * ldc); + DoubleVector cOri12 = DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH + + (cCol + 2) * ldc); + DoubleVector cOri22 = DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH2 + + (cCol + 2) * ldc); + DoubleVector cOri32 = DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH3 + + (cCol + 2) * ldc); + + cOri02 = alphaVec.fma(c02, cOri02); + cOri12 = alphaVec.fma(c12, cOri12); + cOri22 = alphaVec.fma(c22, cOri22); + cOri32 = alphaVec.fma(c32, cOri32); + + cOri02.intoArray(dc, cOffset + cRow + (cCol + 2) * ldc); + cOri12.intoArray(dc, cOffset + cRow + VECTOR_LENGTH + (cCol + 2) * ldc); + cOri22.intoArray(dc, cOffset + cRow + VECTOR_LENGTH2 + (cCol + 2) * ldc); + cOri32.intoArray(dc, cOffset + cRow + VECTOR_LENGTH3 + (cCol + 2) * ldc); + + DoubleVector cOri03 = DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + (cCol + 3) * ldc); + DoubleVector cOri13 = DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH + + (cCol + 3) * ldc); + 
DoubleVector cOri23 = DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH2 + + (cCol + 3) * ldc); + DoubleVector cOri33 = DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH3 + + (cCol + 3) * ldc); + + cOri03 = alphaVec.fma(c03, cOri03); + cOri13 = alphaVec.fma(c13, cOri13); + cOri23 = alphaVec.fma(c23, cOri23); + cOri33 = alphaVec.fma(c33, cOri33); + + cOri03.intoArray(dc, cOffset + cRow + (cCol + 3) * ldc); + cOri13.intoArray(dc, cOffset + cRow + VECTOR_LENGTH + (cCol + 3) * ldc); + cOri23.intoArray(dc, cOffset + cRow + VECTOR_LENGTH2 + (cCol + 3) * ldc); + cOri33.intoArray(dc, cOffset + cRow + VECTOR_LENGTH3 + (cCol + 3) * ldc); + + cRow += DGEMM_UNROLL_M; + } + countI = mc % DGEMM_UNROLL_M; + if (countI >= VECTOR_LENGTH2) { + int bIndx = (nc / DGEMM_UNROLL_N - countJ) * DGEMM_UNROLL_N * kc; + DoubleVector c00 = DoubleVector.zero(DSPECIES); + DoubleVector c01 = DoubleVector.zero(DSPECIES); + DoubleVector c02 = DoubleVector.zero(DSPECIES); + DoubleVector c03 = DoubleVector.zero(DSPECIES); + DoubleVector c10 = DoubleVector.zero(DSPECIES); + DoubleVector c11 = DoubleVector.zero(DSPECIES); + DoubleVector c12 = DoubleVector.zero(DSPECIES); + DoubleVector c13 = DoubleVector.zero(DSPECIES); + int countL = kc; + for (; countL > 0; countL--) { + DoubleVector a0 = DoubleVector.fromArray(DSPECIES, da, aIndx); + DoubleVector a1 = DoubleVector.fromArray(DSPECIES, da, aIndx + VECTOR_LENGTH); + + DoubleVector b0 = DoubleVector.broadcast(DSPECIES, db[bOffset + bIndx]); + DoubleVector b1 = DoubleVector.broadcast(DSPECIES, db[bOffset + bIndx + 1]); + DoubleVector b2 = DoubleVector.broadcast(DSPECIES, db[bOffset + bIndx + 2]); + DoubleVector b3 = DoubleVector.broadcast(DSPECIES, db[bOffset + bIndx + 3]); + + c00 = a0.fma(b0, c00); + c10 = a1.fma(b0, c10); + c01 = a0.fma(b1, c01); + c11 = a1.fma(b1, c11); + + c02 = a0.fma(b2, c02); + c12 = a1.fma(b2, c12); + c03 = a0.fma(b3, c03); + c13 = a1.fma(b3, c13); + + aIndx += VECTOR_LENGTH2; + bIndx += 
DGEMM_UNROLL_N; + } + alphaVec.fma(c00, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + cCol * ldc)).intoArray(dc, + cOffset + cRow + cCol * ldc); + alphaVec.fma(c10, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH + + cCol * ldc)).intoArray(dc, cOffset + cRow + VECTOR_LENGTH + cCol * ldc); + + alphaVec.fma(c01, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + (cCol + 1) * ldc)).intoArray(dc, + cOffset + cRow + (cCol + 1) * ldc); + alphaVec.fma(c11, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH + + (cCol + 1) * ldc)).intoArray(dc, cOffset + cRow + VECTOR_LENGTH + (cCol + 1) * ldc); + + alphaVec.fma(c02, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + (cCol + 2) * ldc)).intoArray(dc, + cOffset + cRow + (cCol + 2) * ldc); + alphaVec.fma(c12, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH + + (cCol + 2) * ldc)).intoArray(dc, cOffset + cRow + VECTOR_LENGTH + (cCol + 2) * ldc); + + alphaVec.fma(c03, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + (cCol + 3) * ldc)).intoArray(dc, + cOffset + cRow + (cCol + 3) * ldc); + alphaVec.fma(c13, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + VECTOR_LENGTH + + (cCol + 3) * ldc)).intoArray(dc, cOffset + cRow + VECTOR_LENGTH + (cCol + 3) * ldc); + + cRow += VECTOR_LENGTH2; + countI -= VECTOR_LENGTH2; + } + if (countI >= VECTOR_LENGTH) { + int bIndx = (nc / DGEMM_UNROLL_N - countJ) * DGEMM_UNROLL_N * kc; + DoubleVector c00 = DoubleVector.zero(DSPECIES); + DoubleVector c01 = DoubleVector.zero(DSPECIES); + DoubleVector c02 = DoubleVector.zero(DSPECIES); + DoubleVector c03 = DoubleVector.zero(DSPECIES); + int countL = kc; + for (; countL > 0; countL--) { + DoubleVector a0 = DoubleVector.fromArray(DSPECIES, da, aIndx); + + DoubleVector b0 = DoubleVector.broadcast(DSPECIES, db[bOffset + bIndx]); + DoubleVector b1 = DoubleVector.broadcast(DSPECIES, db[bOffset + bIndx + 1]); + DoubleVector b2 = DoubleVector.broadcast(DSPECIES, db[bOffset + bIndx + 
2]); + DoubleVector b3 = DoubleVector.broadcast(DSPECIES, db[bOffset + bIndx + 3]); + + c00 = a0.fma(b0, c00); + c01 = a0.fma(b1, c01); + c02 = a0.fma(b2, c02); + c03 = a0.fma(b3, c03); + + aIndx += VECTOR_LENGTH; + bIndx += DGEMM_UNROLL_N; + } + alphaVec.fma(c00, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + cCol * ldc)).intoArray(dc, + cOffset + cRow + cCol * ldc); + alphaVec.fma(c01, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + (cCol + 1) * ldc)).intoArray(dc, + cOffset + cRow + (cCol + 1) * ldc); + alphaVec.fma(c02, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + (cCol + 2) * ldc)).intoArray(dc, + cOffset + cRow + (cCol + 2) * ldc); + alphaVec.fma(c03, DoubleVector.fromArray(DSPECIES, dc, cOffset + cRow + (cCol + 3) * ldc)).intoArray(dc, + cOffset + cRow + (cCol + 3) * ldc); + + cRow += VECTOR_LENGTH; + countI -= VECTOR_LENGTH; + } + while (countI > 0) { + int bIndx = (nc / DGEMM_UNROLL_N - countJ) * DGEMM_UNROLL_N * kc; + double[] cTmp = new double[DGEMM_UNROLL_N]; + int countL = kc; + for (; countL > 0; countL--) { + cTmp[0] += da[aIndx] * db[bOffset + bIndx]; + cTmp[1] += da[aIndx] * db[bOffset + bIndx + 1]; + cTmp[2] += da[aIndx] * db[bOffset + bIndx + 2]; + cTmp[3] += da[aIndx] * db[bOffset + bIndx + 3]; + aIndx += 1; + bIndx += DGEMM_UNROLL_N; + } + dc[cOffset + cRow + cCol * ldc] += alpha * cTmp[0]; + dc[cOffset + cRow + (cCol + 1) * ldc] += alpha * cTmp[1]; + dc[cOffset + cRow + (cCol + 2) * ldc] += alpha * cTmp[2]; + dc[cOffset + cRow + (cCol + 3) * ldc] += alpha * cTmp[3]; + + cRow += 1; + countI -= 1; + } + cCol += DGEMM_UNROLL_N; + } + } + + /** + * onCopy is used for normally packing matrix in the right. 
+ * For example, when DGEMM_UNROLL_N = 4, + * before packing after packing + * 1 6 11 16 21 1 2 3 4 21 + * 2 7 12 17 22 ---> 5 6 7 8 22 + * 3 8 13 18 23 9 10 11 12 23 + * 4 9 14 19 24 13 14 15 16 24 + * 5 10 15 20 25 17 18 19 20 25 + */ + protected static void onCopy(int sizeM, int sizeN, double[] src, int srcRow, int srcCol, int srcOffset, int srcLd, + double[] dst, int dstOffset) { + int col = 0; + int colPackSize = DGEMM_UNROLL_N; + int dstIndex = 0; + for (; col < sizeN - sizeN % colPackSize; col += colPackSize) { + int row = 0; + for (; row < sizeM; row += 1) { + dst[dstOffset + dstIndex] = src[(srcRow + row) + (srcCol + col) * srcLd + srcOffset]; + dst[dstOffset + dstIndex + 1] = src[(srcRow + row) + (srcCol + (col + 1)) * srcLd + srcOffset]; + dst[dstOffset + dstIndex + 2] = src[(srcRow + row) + (srcCol + (col + 2)) * srcLd + srcOffset]; + dst[dstOffset + dstIndex + 3] = src[(srcRow + row) + (srcCol + (col + 3)) * srcLd + srcOffset]; + dstIndex += colPackSize; + } + } + for (; col < sizeN; col += 1) { + int row = 0; + for (; row < sizeM; row += 1) { + dst[dstOffset + dstIndex] = src[(srcRow + row) + (srcCol + col) * srcLd + srcOffset]; + dstIndex += 1; + } + } + } + + /** + * itCopy is used for transpose packing matrix in the left. 
+ * For example, when DGEMM_UNROLL_M = 4, + * before packing after packing + * 1 6 11 16 21 1 5 9 13 17 + * 2 7 12 17 22 ---> 2 6 10 14 18 + * 3 8 13 18 23 3 7 11 15 19 + * 4 9 14 19 24 4 8 12 16 20 + * 5 10 15 20 25 21 22 23 24 25 + */ + protected static void itCopy(int sizeM, int sizeN, double[] src, int srcRow, int srcCol, int srcOffset, int srcLd, + double[] dst, int dstOffset) { + int row = 0; + int dstIndex = 0; + int[] vectorLengthList = {VECTOR_LENGTH4, VECTOR_LENGTH2, VECTOR_LENGTH, 1}; + for (int vectorLen : vectorLengthList) { + while (row + vectorLen <= sizeM) { + int col = 0; + for (; col < sizeN; col++) { + System.arraycopy(src, srcOffset + (srcRow + row) + (srcCol + col) * srcLd, dst, + dstOffset + dstIndex, vectorLen); + dstIndex += vectorLen; + } + row += vectorLen; + } + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/doubleprecision/Dgemm.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/doubleprecision/Dgemm.java new file mode 100644 index 0000000000000000000000000000000000000000..4cae10440b3b418283c69c7782853c22c7b78475 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/doubleprecision/Dgemm.java @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas3.doubleprecision; + +import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.DGEMM_P; +import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.DGEMM_Q; +import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.DGEMM_R; +import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.DGEMM_UNROLL_N; +import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.VECTOR_LENGTH; +import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.VECTOR_LENGTH2; +import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.VECTOR_LENGTH4; + +import com.huawei.vectorblas.utils.BlasUtils; +import com.huawei.vectorblas.utils.Lsame; + +public class Dgemm { + public static void dgemm(String transa, String transb, int m, int n, int k, double alpha, double[] a, int aOffset, + int lda, double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc) { + BlasUtils.checkParameter("DGEMM", 1, Lsame.lsame(transa, "N") || Lsame.lsame(transa, "T")); + BlasUtils.checkParameter("DGEMM", 2, Lsame.lsame(transb, "N") || Lsame.lsame(transb, "T")); + boolean transaFlag = Lsame.lsame(transa, "N"); + boolean transbFlag = Lsame.lsame(transb, "N"); + BlasUtils.checkParameter("DGEMM", 3, m >= 0); + BlasUtils.checkParameter("DGEMM", 4, n >= 0); + BlasUtils.checkParameter("DGEMM", 5, k >= 0); + BlasUtils.checkParameter("DGEMM", 8, lda >= Math.max(1, (transaFlag ? m : k))); + BlasUtils.checkParameter("DGEMM", 10, ldb >= Math.max(1, (transbFlag ? 
k : n))); + BlasUtils.checkParameter("DGEMM", 13, ldc >= Math.max(1, m)); + + if (m == 0 || n == 0) { + return; + } + if (Double.compare(beta, 1.0d) != 0) { + BlasUtils.checkBlasArray("c", cOffset, (m - 1) + (n - 1) * ldc, c.length); + DblasLevel3.betaMulC(m, n, beta, c, cOffset, ldc); + } + if (BlasUtils.isZero(alpha) || k == 0) { + return; + } + BlasUtils.checkBlasArray("a", aOffset, ((transaFlag ? m : k) - 1) + ((transaFlag ? k : m) - 1) * lda, a.length); + BlasUtils.checkBlasArray("b", bOffset, ((transbFlag ? k : n) - 1) + ((transbFlag ? n : k) - 1) * ldb, b.length); + BlasUtils.checkBlasArray("c", cOffset, (m - 1) + (n - 1) * ldc, c.length); + dgemmVector(transa, transb, m, n, k, a, aOffset, lda, alpha, b, bOffset, ldb, c, cOffset, ldc); + } + + private static void dgemmVector(String transa, String transb, int sizeM, int sizeN, int sizeK, double[] da, + int aOffset, int lda, double alpha, double[] db, int bOffset, int ldb, double[] dc, int cOffset, int ldc) { + int mc = Math.min(DGEMM_P, sizeM); + int nc = Math.min(DGEMM_R, sizeN); + int kc = Math.min(DGEMM_Q, sizeK); + double[] packa = new double[kc * mc]; + double[] packb = new double[kc * nc]; + for (int ns = 0; ns < sizeN; ns += nc) { + nc = Math.min(nc, sizeN - ns); + for (int ks = 0; ks < sizeK; ks += kc) { + kc = Math.min(kc, sizeK - ks); + if (Lsame.lsame(transb, "N")) { + DblasLevel3.onCopy(kc, nc, db, ks, ns, bOffset, ldb, packb, 0); // packing matrix b + } else { + otCopy(nc, kc, db, ns, ks, bOffset, ldb, packb, 0); + } + for (int ms = 0; ms < sizeM; ms += mc) { + mc = Math.min(mc, sizeM - ms); + if (Lsame.lsame(transa, "N")) { + DblasLevel3.itCopy(mc, kc, da, ms, ks, aOffset, lda, packa, 0); // packing matrix a + } else { + inCopy(kc, mc, da, ks, ms, aOffset, lda, packa, 0); + } + DblasLevel3.kernelOperation8x4(mc, nc, kc, alpha, packa, packb, 0, dc, ldc, cOffset, ms, ns); + } + } + } + } + + /** + * otCopy method is used for transpose packing matrix in the right. 
+ * For example, when DGEMM_UNROLL_N = 4, + * before packing after packing + * 1 6 11 16 1 5 9 13 + * 2 7 12 17 ---> 2 6 10 14 + * 3 8 13 18 3 7 11 15 + * 4 9 14 19 4 8 12 16 + * 5 10 15 20 17 18 19 20 + */ + private static void otCopy(int sizeM, int sizeN, double[] src, int srcRow, int srcCol, int srcOffset, int srcLd, + double[] dst, int dstOffset) { + int row = 0; + int colPackSize = DGEMM_UNROLL_N; + int dstIndex = 0; + for (; row < sizeM - sizeM % colPackSize; row += colPackSize) { + int col = 0; + for (; col < sizeN; col += 1) { + System.arraycopy(src, (srcRow + row) + (srcCol + col) * srcLd + srcOffset, dst, + dstOffset + dstIndex, DGEMM_UNROLL_N); + dstIndex += colPackSize; + } + } + for (; row < sizeM; row += 1) { + int col = 0; + for (; col < sizeN; col += 1) { + dst[dstOffset + dstIndex] = src[(srcRow + row) + (srcCol + col) * srcLd + srcOffset]; + dstIndex += 1; + } + } + } + + /** + * inCopy is used for normally packing matrix in the left. + * For example, when DGEMM_UNROLL_M = 4, + * before packing after packing + * 1 6 11 16 21 1 2 3 4 21 + * 2 7 12 17 22 ---> 5 6 7 8 22 + * 3 8 13 18 23 9 10 11 12 23 + * 4 9 14 19 24 13 14 15 16 24 + * 5 10 15 20 25 17 18 19 20 25 + */ + private static void inCopy(int sizeM, int sizeN, double[] src, int srcRow, int srcCol, int srcOffset, int srcLd, + double[] dst, int dstOffset) { + int col = 0; + int dstIndex = 0; + int[] vectorLengthList = {VECTOR_LENGTH4, VECTOR_LENGTH2, VECTOR_LENGTH, 1}; + for (int vectorLen : vectorLengthList) { + while (col + vectorLen <= sizeN) { + int row = 0; + for (; row < sizeM; row++) { + for (int count = 0; count < vectorLen; count++) { + dst[dstOffset + dstIndex + count] = src[srcOffset + (srcRow + row) + (srcCol + (col + count)) + * srcLd]; + } + dstIndex += vectorLen; + } + col += vectorLen; + } + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/doubleprecision/Dsymm.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/doubleprecision/Dsymm.java new 
file mode 100644 index 0000000000000000000000000000000000000000..2a1f9d8f051b1d2349de4d4ac619b5d4d0548133 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/doubleprecision/Dsymm.java @@ -0,0 +1,263 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.huawei.vectorblas.blas3.doubleprecision; + +import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.DGEMM_P; +import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.DGEMM_Q; +import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.DGEMM_R; +import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.DGEMM_UNROLL_N; +import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.VECTOR_LENGTH; +import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.VECTOR_LENGTH2; +import static com.huawei.vectorblas.blas3.doubleprecision.DblasLevel3.VECTOR_LENGTH4; + +import com.huawei.vectorblas.utils.BlasUtils; +import com.huawei.vectorblas.utils.Lsame; + +public class Dsymm { + public static void dsymm(String side, String uplo, int m, int n, double alpha, double[] a, int aOffset, int lda, + double[] b, int bOffset, int ldb, double beta, double[] c, int cOffset, int ldc) { + BlasUtils.checkParameter("DSYMM", 1, Lsame.lsame(side, "L") || Lsame.lsame(side, "R")); + BlasUtils.checkParameter("DSYMM", 2, Lsame.lsame(uplo, "U") || Lsame.lsame(uplo, "L")); + boolean sideFlag = Lsame.lsame(side, "L"); + BlasUtils.checkParameter("DSYMM", 3, m >= 0); + BlasUtils.checkParameter("DSYMM", 4, n >= 0); + BlasUtils.checkParameter("DSYMM", 7, lda >= Math.max(1, (sideFlag ? m : n))); + BlasUtils.checkParameter("DSYMM", 9, ldb >= Math.max(1, m)); + BlasUtils.checkParameter("DSYMM", 12, ldc >= Math.max(1, m)); + + if (m == 0 || n == 0) { + return; + } + if (Double.compare(beta, 1.0d) != 0) { + BlasUtils.checkBlasArray("c", cOffset, (m - 1) + (n - 1) * ldc, c.length); + DblasLevel3.betaMulC(m, n, beta, c, cOffset, ldc); + } + if (BlasUtils.isZero(alpha)) { + return; + } + BlasUtils.checkBlasArray("a", aOffset, ((sideFlag ? m : n) - 1) + ((sideFlag ? 
m : n) - 1) * lda, a.length); + BlasUtils.checkBlasArray("b", bOffset, (m - 1) + (n - 1) * ldb, b.length); + BlasUtils.checkBlasArray("c", cOffset, (m - 1) + (n - 1) * ldc, c.length); + dsymmVector(side, uplo, m, n, sideFlag ? m : n, a, aOffset, lda, alpha, b, bOffset, ldb, c, cOffset, ldc); + } + + private static void dsymmVector(String side, String uplo, int sizeM, int sizeN, int sizeK, double[] da, int aOffset, + int lda, double alpha, double[] db, int bOffset, int ldb, double[] dc, int cOffset, int ldc) { + int mc = Math.min(DGEMM_P, sizeM); + int nc = Math.min(DGEMM_R, sizeN); + int kc = Math.min(DGEMM_Q, sizeK); + boolean sideFlag = Lsame.lsame(side, "L"); + double[] packa = new double[kc * (sideFlag ? mc : nc)]; + double[] packb = new double[kc * (sideFlag ? nc : mc)]; + for (int ns = 0; ns < sizeN; ns += nc) { + nc = Math.min(nc, sizeN - ns); + for (int ks = 0; ks < sizeK; ks += kc) { + kc = Math.min(kc, sizeK - ks); + if (Lsame.lsame(side, "L")) { + DblasLevel3.onCopy(kc, nc, db, ks, ns, bOffset, ldb, packb, 0); + } else if (Lsame.lsame(side, "R") && Lsame.lsame(uplo, "U")) { + outCopy(kc, nc, da, aOffset, lda, packa, 0, ns, ks); + } else { + oltCopy(kc, nc, da, aOffset, lda, packa, 0, ns, ks); + } + for (int ms = 0; ms < sizeM; ms += mc) { + mc = Math.min(mc, sizeM - ms); + if (Lsame.lsame(side, "L") && Lsame.lsame(uplo, "U")) { + iutCopy(kc, mc, da, aOffset, lda, packa, 0, ms, ks); + DblasLevel3.kernelOperation8x4(mc, nc, kc, alpha, packa, packb, 0, dc, ldc, cOffset, ms, ns); + } else if (Lsame.lsame(side, "L") && Lsame.lsame(uplo, "L")) { + iltCopy(kc, mc, da, aOffset, lda, packa, 0, ms, ks); + DblasLevel3.kernelOperation8x4(mc, nc, kc, alpha, packa, packb, 0, dc, ldc, cOffset, ms, ns); + } else { + DblasLevel3.itCopy(mc, kc, db, ms, ks, bOffset, ldb, packb, 0); + DblasLevel3.kernelOperation8x4(mc, nc, kc, alpha, packb, packa, 0, dc, ldc, cOffset, ms, ns); + } + } + } + } + } + + /** + * oltCopy method is used for packing lower matrix in the right. 
+ */ + private static void oltCopy(int sizeM, int sizeN, double[] src, int srcOffset, int srcLd, double[] dst, + int dstOffset, int posX, int posY) { + int dstIndex = 0; + int countJ = sizeN; + int[] vectorLenthList = {DGEMM_UNROLL_N, 1}; + for (int vectorLen : vectorLenthList) { + while (countJ - vectorLen >= 0) { + int delta = posX - posY; + int[] offset = new int[vectorLen]; + for (int index = 0; index < vectorLen; index++) { + if (delta > -index) { + offset[index] = posX + index + posY * srcLd; + } else { + offset[index] = posY + (posX + index) * srcLd; + } + } + + int countI = sizeM; + for (; countI > 0; countI--) { + // read and write data + for (int index = 0; index < vectorLen; index++) { + dst[dstOffset + dstIndex] = src[srcOffset + offset[index]]; + dstIndex += 1; + if (delta > -index) { + offset[index] += srcLd; + } else { + offset[index]++; + } + } + delta--; + } + + posX += vectorLen; + countJ -= vectorLen; + } + } + } + + /** + * outCopy method is used for packing upper matrix in the right. 
+ */ + private static void outCopy(int sizeM, int sizeN, double[] src, int srcOffset, int srcLd, double[] dst, + int dstOffset, int posX, int posY) { + int dstIndex = 0; + int countJ = sizeN; + int[] vectorLenthList = {DGEMM_UNROLL_N, 1}; + for (int vectorLen : vectorLenthList) { + while (countJ - vectorLen >= 0) { + int delta = posX - posY; + int[] offset = new int[vectorLen]; + for (int index = 0; index < vectorLen; index++) { + if (delta > -index) { + offset[index] = posY + (posX + index) * srcLd; + } else { + offset[index] = posX + index + posY * srcLd; + } + } + + int countI = sizeM; + for (; countI > 0; countI--) { + // read and write data + for (int index = 0; index < vectorLen; index++) { + dst[dstOffset + dstIndex] = src[srcOffset + offset[index]]; + dstIndex += 1; + if (delta > -index) { + offset[index]++; + } else { + offset[index] += srcLd; + } + } + delta--; + } + + posX += vectorLen; + countJ -= vectorLen; + } + } + } + + /** + * iltCopy method is used for packing lower matrix in the left. 
+ */ + private static void iltCopy(int sizeM, int sizeN, double[] src, int srcOffset, int srcLd, double[] dst, + int dstOffset, int posX, int posY) { + int dstIndex = 0; + int countJ = sizeN; + int[] vectorLengthList = {VECTOR_LENGTH4, VECTOR_LENGTH2, VECTOR_LENGTH, 1}; + for (int vectorLen : vectorLengthList) { + while (countJ - vectorLen >= 0) { + int delta = posX - posY; + int[] offset = new int[vectorLen]; + for (int index = 0; index < vectorLen; index++) { + if (delta > -index) { + offset[index] = posX + index + posY * srcLd; + } else { + offset[index] = posY + (posX + index) * srcLd; + } + } + + int countI = sizeM; + for (; countI > 0; countI--) { + // read and write data + for (int index = 0; index < vectorLen; index++) { + dst[dstOffset + dstIndex] = src[srcOffset + offset[index]]; + dstIndex += 1; + if (delta > -index) { + offset[index] += srcLd; + } else { + offset[index]++; + } + } + delta--; + } + + posX += vectorLen; + countJ -= vectorLen; + } + } + } + + /** + * iutCopy method is used for packing upper matrix in the left. 
+ */ + private static void iutCopy(int sizeM, int sizeN, double[] src, int srcOffset, int srcLd, double[] dst, + int dstOffset, int posX, int posY) { + int dstIndex = 0; + int countJ = sizeN; + int[] vectorLengthList = {VECTOR_LENGTH4, VECTOR_LENGTH2, VECTOR_LENGTH, 1}; + for (int vectorLen : vectorLengthList) { + while (countJ - vectorLen >= 0) { + int delta = posX - posY; + int[] offset = new int[vectorLen]; + for (int index = 0; index < vectorLen; index++) { + if (delta > -index) { + offset[index] = posY + (posX + index) * srcLd; + } else { + offset[index] = posX + index + posY * srcLd; + } + } + + int countI = sizeM; + for (; countI > 0; countI--) { + // read and write data + for (int index = 0; index < vectorLen; index++) { + dst[dstOffset + dstIndex] = src[srcOffset + offset[index]]; + dstIndex += 1; + if (delta > -index) { + offset[index]++; + } else { + offset[index] += srcLd; + } + } + delta--; + } + + posX += vectorLen; + countJ -= vectorLen; + } + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/singleprecision/SblasLevel3.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/singleprecision/SblasLevel3.java new file mode 100644 index 0000000000000000000000000000000000000000..b1ad539f00ca1859edd8115bf4aaa69b6c024894 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/singleprecision/SblasLevel3.java @@ -0,0 +1,460 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas3.singleprecision; + +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorSpecies; + +public class SblasLevel3 { + private static final VectorSpecies SSPECIES = FloatVector.SPECIES_MAX; + protected static final int SGEMM_P = 256; // Blocking size for m direction. + protected static final int SGEMM_Q = 256; // Blocking size for k direction. + protected static final int SGEMM_R = 8192; // Blocking size for n direction. 
+ protected static final int VECTOR_LENGTH = SSPECIES.length(); + protected static final int VECTOR_LENGTH2 = 2 * VECTOR_LENGTH; + protected static final int VECTOR_LENGTH3 = 3 * VECTOR_LENGTH; + protected static final int VECTOR_LENGTH4 = 4 * VECTOR_LENGTH; + protected static final int SGEMM_UNROLL_M = 4 * VECTOR_LENGTH; + protected static final int SGEMM_UNROLL_N = 4; + + protected static void betaMulC(int sizeM, int sizeN, float beta, float[] sc, int cOffset, int ldc) { + FloatVector betav = FloatVector.broadcast(SSPECIES, beta); + for (int col = 0; col < sizeN; col++) { + int row = 0; + for (; row < sizeM - VECTOR_LENGTH; row += VECTOR_LENGTH) { + FloatVector cv = FloatVector.fromArray(SSPECIES, sc, row + col * ldc + cOffset); + cv.mul(betav).intoArray(sc, row + col * ldc + cOffset); + } + for (; row < sizeM; row++) { + sc[row + col * ldc + cOffset] *= beta; + } + } + } + + protected static void kernelOperation16x4(int mc, int nc, int kc, float alpha, float[] sa, float[] sb, int bOffset, + float[] sc, int ldc, int cOffset, int csRow, int csCol) { + kernelOperation16x4Main(mc, nc, kc, alpha, sa, sb, bOffset, sc, ldc, cOffset, csRow, csCol); + kernelOperation16x4NBorder(mc, nc, kc, alpha, sa, sb, bOffset, sc, ldc, cOffset, csRow, csCol); + } + + private static void kernelOperation16x4NBorder(int mc, int nc, int kc, float alpha, float[] sa, float[] sb, + int bOffset, float[] sc, int ldc, int cOffset, int csRow, int csCol) { + FloatVector alphaVec = FloatVector.broadcast(SSPECIES, alpha); + int cCol = csCol + (nc / SGEMM_UNROLL_N) * SGEMM_UNROLL_N; + int countJ = nc % SGEMM_UNROLL_N; + for (; countJ > 0; countJ--) { + int cRow = csRow; + int aIndx = 0; + int countI = mc / SGEMM_UNROLL_M; + for (; countI > 0; countI--) { + int bIndx = (nc - countJ) * kc; + FloatVector c00 = FloatVector.zero(SSPECIES); + FloatVector c10 = FloatVector.zero(SSPECIES); + FloatVector c20 = FloatVector.zero(SSPECIES); + FloatVector c30 = FloatVector.zero(SSPECIES); + int countL = kc; + 
for (; countL > 0; countL--) { + FloatVector a0 = FloatVector.fromArray(SSPECIES, sa, aIndx); + FloatVector a1 = FloatVector.fromArray(SSPECIES, sa, aIndx + VECTOR_LENGTH); + FloatVector a2 = FloatVector.fromArray(SSPECIES, sa, aIndx + VECTOR_LENGTH2); + FloatVector a3 = FloatVector.fromArray(SSPECIES, sa, aIndx + VECTOR_LENGTH3); + FloatVector b0 = FloatVector.broadcast(SSPECIES, sb[bOffset + bIndx]); + + c00 = a0.fma(b0, c00); + c10 = a1.fma(b0, c10); + c20 = a2.fma(b0, c20); + c30 = a3.fma(b0, c30); + + aIndx += SGEMM_UNROLL_M; + bIndx += 1; + } + alphaVec.fma(c00, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + cCol * ldc)).intoArray(sc, + cOffset + cRow + cCol * ldc); + alphaVec.fma(c10, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH + + cCol * ldc)).intoArray(sc, cOffset + cRow + VECTOR_LENGTH + cCol * ldc); + alphaVec.fma(c20, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH2 + + cCol * ldc)).intoArray(sc, cOffset + cRow + VECTOR_LENGTH2 + cCol * ldc); + alphaVec.fma(c30, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH3 + + cCol * ldc)).intoArray(sc, cOffset + cRow + VECTOR_LENGTH3 + cCol * ldc); + + cRow += SGEMM_UNROLL_M; + } + countI = mc % SGEMM_UNROLL_M; + if (countI >= VECTOR_LENGTH2) { + int bIndx = (nc - countJ) * kc; + FloatVector c00 = FloatVector.zero(SSPECIES); + FloatVector c10 = FloatVector.zero(SSPECIES); + int countL = kc; + for (; countL > 0; countL--) { + FloatVector a0 = FloatVector.fromArray(SSPECIES, sa, aIndx); + FloatVector a1 = FloatVector.fromArray(SSPECIES, sa, aIndx + VECTOR_LENGTH); + FloatVector b0 = FloatVector.broadcast(SSPECIES, sb[bOffset + bIndx]); + + c00 = a0.fma(b0, c00); + c10 = a1.fma(b0, c10); + + aIndx += VECTOR_LENGTH2; + bIndx += 1; + } + alphaVec.fma(c00, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + cCol * ldc)).intoArray(sc, + cOffset + cRow + cCol * ldc); + alphaVec.fma(c10, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH + + 
cCol * ldc)).intoArray(sc, cOffset + cRow + VECTOR_LENGTH + cCol * ldc); + + cRow += VECTOR_LENGTH2; + countI -= VECTOR_LENGTH2; + } + if (countI >= VECTOR_LENGTH) { + int bIndx = (nc - countJ) * kc; + FloatVector c00 = FloatVector.zero(SSPECIES); + int countL = kc; + for (; countL > 0; countL--) { + FloatVector a0 = FloatVector.fromArray(SSPECIES, sa, aIndx); + FloatVector b0 = FloatVector.broadcast(SSPECIES, sb[bOffset + bIndx]); + c00 = a0.fma(b0, c00); + aIndx += VECTOR_LENGTH; + bIndx += 1; + } + alphaVec.fma(c00, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + cCol * ldc)).intoArray(sc, + cOffset + cRow + cCol * ldc); + + cRow += VECTOR_LENGTH; + countI -= VECTOR_LENGTH; + } + while (countI > 0) { + int bIndx = (nc - countJ) * kc; + float[] cTmp = new float[1]; + int countL = kc; + for (; countL > 0; countL--) { + cTmp[0] += sa[aIndx] * sb[bIndx]; + aIndx += 1; + bIndx += 1; + } + sc[cOffset + cRow + cCol * ldc] += alpha * cTmp[0]; + + cRow += 1; + countI -= 1; + } + cCol += 1; + } + } + + private static void kernelOperation16x4Main(int mc, int nc, int kc, float alpha, float[] sa, float[] sb, + int bOffset, float[] sc, int ldc, int cOffset, int csRow, int csCol) { + FloatVector alphaVec = FloatVector.broadcast(SSPECIES, alpha); + int countJ = nc / SGEMM_UNROLL_N; + int cCol = csCol; + for (; countJ > 0; countJ--) { + int cRow = csRow; + int aIndx = 0; + int countI = mc / SGEMM_UNROLL_M; + for (; countI > 0; countI--) { + FloatVector c00 = FloatVector.zero(SSPECIES); + FloatVector c10 = FloatVector.zero(SSPECIES); + FloatVector c20 = FloatVector.zero(SSPECIES); + FloatVector c30 = FloatVector.zero(SSPECIES); + FloatVector c01 = FloatVector.zero(SSPECIES); + FloatVector c11 = FloatVector.zero(SSPECIES); + FloatVector c21 = FloatVector.zero(SSPECIES); + FloatVector c31 = FloatVector.zero(SSPECIES); + FloatVector c02 = FloatVector.zero(SSPECIES); + FloatVector c12 = FloatVector.zero(SSPECIES); + FloatVector c22 = FloatVector.zero(SSPECIES); + FloatVector c32 
= FloatVector.zero(SSPECIES); + FloatVector c03 = FloatVector.zero(SSPECIES); + FloatVector c13 = FloatVector.zero(SSPECIES); + FloatVector c23 = FloatVector.zero(SSPECIES); + FloatVector c33 = FloatVector.zero(SSPECIES); + int bIndx = (nc / SGEMM_UNROLL_N - countJ) * SGEMM_UNROLL_N * kc; + int countL = kc; + for (; countL > 0; countL--) { + FloatVector b0 = FloatVector.broadcast(SSPECIES, sb[bOffset + bIndx]); + + FloatVector a0 = FloatVector.fromArray(SSPECIES, sa, aIndx); + FloatVector a1 = FloatVector.fromArray(SSPECIES, sa, aIndx + VECTOR_LENGTH); + FloatVector a2 = FloatVector.fromArray(SSPECIES, sa, aIndx + VECTOR_LENGTH2); + FloatVector a3 = FloatVector.fromArray(SSPECIES, sa, aIndx + VECTOR_LENGTH3); + + c00 = a0.fma(b0, c00); + c10 = a1.fma(b0, c10); + c20 = a2.fma(b0, c20); + c30 = a3.fma(b0, c30); + + FloatVector b1 = FloatVector.broadcast(SSPECIES, sb[bOffset + bIndx + 1]); + c01 = a0.fma(b1, c01); + c11 = a1.fma(b1, c11); + c21 = a2.fma(b1, c21); + c31 = a3.fma(b1, c31); + + FloatVector b2 = FloatVector.broadcast(SSPECIES, sb[bOffset + bIndx + 2]); + c02 = a0.fma(b2, c02); + c12 = a1.fma(b2, c12); + c22 = a2.fma(b2, c22); + c32 = a3.fma(b2, c32); + + FloatVector b3 = FloatVector.broadcast(SSPECIES, sb[bOffset + bIndx + 3]); + c03 = a0.fma(b3, c03); + c13 = a1.fma(b3, c13); + c23 = a2.fma(b3, c23); + c33 = a3.fma(b3, c33); + aIndx += SGEMM_UNROLL_M; + bIndx += SGEMM_UNROLL_N; + } + FloatVector cOri00 = FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + cCol * ldc); + FloatVector cOri10 = FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH + + cCol * ldc); + FloatVector cOri20 = FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH2 + + cCol * ldc); + FloatVector cOri30 = FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH3 + + cCol * ldc); + + cOri00 = alphaVec.fma(c00, cOri00); + cOri10 = alphaVec.fma(c10, cOri10); + cOri20 = alphaVec.fma(c20, cOri20); + cOri30 = alphaVec.fma(c30, cOri30); + + 
cOri00.intoArray(sc, cOffset + cRow + cCol * ldc); + cOri10.intoArray(sc, cOffset + cRow + VECTOR_LENGTH + cCol * ldc); + cOri20.intoArray(sc, cOffset + cRow + VECTOR_LENGTH2 + cCol * ldc); + cOri30.intoArray(sc, cOffset + cRow + VECTOR_LENGTH3 + cCol * ldc); + + FloatVector cOri01 = FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + (cCol + 1) * ldc); + FloatVector cOri11 = FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH + + (cCol + 1) * ldc); + FloatVector cOri21 = FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH2 + + (cCol + 1) * ldc); + FloatVector cOri31 = FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH3 + + (cCol + 1) * ldc); + + cOri01 = alphaVec.fma(c01, cOri01); + cOri11 = alphaVec.fma(c11, cOri11); + cOri21 = alphaVec.fma(c21, cOri21); + cOri31 = alphaVec.fma(c31, cOri31); + + cOri01.intoArray(sc, cOffset + cRow + (cCol + 1) * ldc); + cOri11.intoArray(sc, cOffset + cRow + VECTOR_LENGTH + (cCol + 1) * ldc); + cOri21.intoArray(sc, cOffset + cRow + VECTOR_LENGTH2 + (cCol + 1) * ldc); + cOri31.intoArray(sc, cOffset + cRow + VECTOR_LENGTH3 + (cCol + 1) * ldc); + + FloatVector cOri02 = FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + (cCol + 2) * ldc); + FloatVector cOri12 = FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH + + (cCol + 2) * ldc); + FloatVector cOri22 = FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH2 + + (cCol + 2) * ldc); + FloatVector cOri32 = FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH3 + + (cCol + 2) * ldc); + + cOri02 = alphaVec.fma(c02, cOri02); + cOri12 = alphaVec.fma(c12, cOri12); + cOri22 = alphaVec.fma(c22, cOri22); + cOri32 = alphaVec.fma(c32, cOri32); + + cOri02.intoArray(sc, cOffset + cRow + (cCol + 2) * ldc); + cOri12.intoArray(sc, cOffset + cRow + VECTOR_LENGTH + (cCol + 2) * ldc); + cOri22.intoArray(sc, cOffset + cRow + VECTOR_LENGTH2 + (cCol + 2) * ldc); + cOri32.intoArray(sc, cOffset + cRow + VECTOR_LENGTH3 
+ (cCol + 2) * ldc); + + FloatVector cOri03 = FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + (cCol + 3) * ldc); + FloatVector cOri13 = FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH + + (cCol + 3) * ldc); + FloatVector cOri23 = FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH2 + + (cCol + 3) * ldc); + FloatVector cOri33 = FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH3 + + (cCol + 3) * ldc); + + cOri03 = alphaVec.fma(c03, cOri03); + cOri13 = alphaVec.fma(c13, cOri13); + cOri23 = alphaVec.fma(c23, cOri23); + cOri33 = alphaVec.fma(c33, cOri33); + + cOri03.intoArray(sc, cOffset + cRow + (cCol + 3) * ldc); + cOri13.intoArray(sc, cOffset + cRow + VECTOR_LENGTH + (cCol + 3) * ldc); + cOri23.intoArray(sc, cOffset + cRow + VECTOR_LENGTH2 + (cCol + 3) * ldc); + cOri33.intoArray(sc, cOffset + cRow + VECTOR_LENGTH3 + (cCol + 3) * ldc); + + cRow += SGEMM_UNROLL_M; + } + countI = mc % SGEMM_UNROLL_M; + if (countI >= VECTOR_LENGTH2) { + int bIndx = (nc / SGEMM_UNROLL_N - countJ) * SGEMM_UNROLL_N * kc; + FloatVector c00 = FloatVector.zero(SSPECIES); + FloatVector c10 = FloatVector.zero(SSPECIES); + FloatVector c01 = FloatVector.zero(SSPECIES); + FloatVector c11 = FloatVector.zero(SSPECIES); + FloatVector c02 = FloatVector.zero(SSPECIES); + FloatVector c12 = FloatVector.zero(SSPECIES); + FloatVector c03 = FloatVector.zero(SSPECIES); + FloatVector c13 = FloatVector.zero(SSPECIES); + int countL = kc; + for (; countL > 0; countL--) { + FloatVector a0 = FloatVector.fromArray(SSPECIES, sa, aIndx); + FloatVector a1 = FloatVector.fromArray(SSPECIES, sa, aIndx + VECTOR_LENGTH); + + FloatVector b0 = FloatVector.broadcast(SSPECIES, sb[bOffset + bIndx]); + FloatVector b1 = FloatVector.broadcast(SSPECIES, sb[bOffset + bIndx + 1]); + FloatVector b2 = FloatVector.broadcast(SSPECIES, sb[bOffset + bIndx + 2]); + FloatVector b3 = FloatVector.broadcast(SSPECIES, sb[bOffset + bIndx + 3]); + + c00 = a0.fma(b0, c00); + c10 = a1.fma(b0, 
c10); + c01 = a0.fma(b1, c01); + c11 = a1.fma(b1, c11); + + c02 = a0.fma(b2, c02); + c12 = a1.fma(b2, c12); + c03 = a0.fma(b3, c03); + c13 = a1.fma(b3, c13); + + aIndx += VECTOR_LENGTH2; + bIndx += SGEMM_UNROLL_N; + } + alphaVec.fma(c00, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + cCol * ldc)).intoArray(sc, + cOffset + cRow + cCol * ldc); + alphaVec.fma(c10, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH + + cCol * ldc)).intoArray(sc, cOffset + cRow + VECTOR_LENGTH + cCol * ldc); + + alphaVec.fma(c01, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + (cCol + 1) * ldc)).intoArray(sc, + cOffset + cRow + (cCol + 1) * ldc); + alphaVec.fma(c11, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH + + (cCol + 1) * ldc)).intoArray(sc, cOffset + cRow + VECTOR_LENGTH + (cCol + 1) * ldc); + + alphaVec.fma(c02, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + (cCol + 2) * ldc)).intoArray(sc, + cOffset + cRow + (cCol + 2) * ldc); + alphaVec.fma(c12, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH + + (cCol + 2) * ldc)).intoArray(sc, cOffset + cRow + VECTOR_LENGTH + (cCol + 2) * ldc); + + alphaVec.fma(c03, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + (cCol + 3) * ldc)).intoArray(sc, + cOffset + cRow + (cCol + 3) * ldc); + alphaVec.fma(c13, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + VECTOR_LENGTH + + (cCol + 3) * ldc)).intoArray(sc, cOffset + cRow + VECTOR_LENGTH + (cCol + 3) * ldc); + + cRow += VECTOR_LENGTH2; + countI -= VECTOR_LENGTH2; + } + if (countI >= VECTOR_LENGTH) { + int bIndx = (nc / SGEMM_UNROLL_N - countJ) * SGEMM_UNROLL_N * kc; + FloatVector c00 = FloatVector.zero(SSPECIES); + FloatVector c01 = FloatVector.zero(SSPECIES); + FloatVector c02 = FloatVector.zero(SSPECIES); + FloatVector c03 = FloatVector.zero(SSPECIES); + int countL = kc; + for (; countL > 0; countL--) { + FloatVector a0 = FloatVector.fromArray(SSPECIES, sa, aIndx); + + FloatVector b0 = 
FloatVector.broadcast(SSPECIES, sb[bOffset + bIndx]); + FloatVector b1 = FloatVector.broadcast(SSPECIES, sb[bOffset + bIndx + 1]); + FloatVector b2 = FloatVector.broadcast(SSPECIES, sb[bOffset + bIndx + 2]); + FloatVector b3 = FloatVector.broadcast(SSPECIES, sb[bOffset + bIndx + 3]); + + c00 = a0.fma(b0, c00); + c01 = a0.fma(b1, c01); + c02 = a0.fma(b2, c02); + c03 = a0.fma(b3, c03); + + aIndx += VECTOR_LENGTH; + bIndx += SGEMM_UNROLL_N; + } + alphaVec.fma(c00, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + cCol * ldc)).intoArray(sc, + cOffset + cRow + cCol * ldc); + alphaVec.fma(c01, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + (cCol + 1) * ldc)).intoArray(sc, + cOffset + cRow + (cCol + 1) * ldc); + alphaVec.fma(c02, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + (cCol + 2) * ldc)).intoArray(sc, + cOffset + cRow + (cCol + 2) * ldc); + alphaVec.fma(c03, FloatVector.fromArray(SSPECIES, sc, cOffset + cRow + (cCol + 3) * ldc)).intoArray(sc, + cOffset + cRow + (cCol + 3) * ldc); + + cRow += VECTOR_LENGTH; + countI -= VECTOR_LENGTH; + } + while (countI > 0) { + int bIndx = (nc / SGEMM_UNROLL_N - countJ) * SGEMM_UNROLL_N * kc; + float[] cTmp = new float[SGEMM_UNROLL_N]; + int countL = kc; + for (; countL > 0; countL--) { + cTmp[0] += sa[aIndx] * sb[bOffset + bIndx]; + cTmp[1] += sa[aIndx] * sb[bOffset + bIndx + 1]; + cTmp[2] += sa[aIndx] * sb[bOffset + bIndx + 2]; + cTmp[3] += sa[aIndx] * sb[bOffset + bIndx + 3]; + aIndx += 1; + bIndx += SGEMM_UNROLL_N; + } + sc[cOffset + cRow + cCol * ldc] += alpha * cTmp[0]; + sc[cOffset + cRow + (cCol + 1) * ldc] += alpha * cTmp[1]; + sc[cOffset + cRow + (cCol + 2) * ldc] += alpha * cTmp[2]; + sc[cOffset + cRow + (cCol + 3) * ldc] += alpha * cTmp[3]; + + cRow += 1; + countI -= 1; + } + cCol += SGEMM_UNROLL_N; + } + } + + /** + * onCopy is used for normally packing matrix in the right. 
+ */ + protected static void onCopy(int sizeM, int sizeN, float[] src, int srcRow, int srcCol, int srcOffset, int srcLd, + float[] dst, int dstOffset) { + int col = 0; + int colPackSize = SGEMM_UNROLL_N; + int dstIndex = 0; + for (; col < sizeN - sizeN % colPackSize; col += colPackSize) { + int row = 0; + for (; row < sizeM; row += 1) { + dst[dstOffset + dstIndex] = src[(srcRow + row) + (srcCol + col) * srcLd + srcOffset]; + dst[dstOffset + dstIndex + 1] = src[(srcRow + row) + (srcCol + (col + 1)) * srcLd + srcOffset]; + dst[dstOffset + dstIndex + 2] = src[(srcRow + row) + (srcCol + (col + 2)) * srcLd + srcOffset]; + dst[dstOffset + dstIndex + 3] = src[(srcRow + row) + (srcCol + (col + 3)) * srcLd + srcOffset]; + dstIndex += colPackSize; + } + } + for (; col < sizeN; col += 1) { + int row = 0; + for (; row < sizeM; row += 1) { + dst[dstOffset + dstIndex] = src[(srcRow + row) + (srcCol + col) * srcLd + srcOffset]; + dstIndex += 1; + } + } + } + + /** + * itCopy is used for transpose packing matrix in the left. 
+ */ + protected static void itCopy(int sizeM, int sizeN, float[] src, int srcRow, int srcCol, int srcOffset, int srcLd, + float[] dst, int dstOffset) { + int row = 0; + int dstIndex = 0; + int[] vectorLengthList = {VECTOR_LENGTH4, VECTOR_LENGTH2, VECTOR_LENGTH, 1}; + for (int vectorLen : vectorLengthList) { + while (row + vectorLen <= sizeM) { + int col = 0; + for (; col < sizeN; col++) { + System.arraycopy(src, srcOffset + (srcRow + row) + (srcCol + col) * srcLd, dst, + dstOffset + dstIndex, vectorLen); + dstIndex += vectorLen; + } + row += vectorLen; + } + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/singleprecision/Sgemm.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/singleprecision/Sgemm.java new file mode 100644 index 0000000000000000000000000000000000000000..d134adaea0ee32b92597ea7f29f4589fe0d42087 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/singleprecision/Sgemm.java @@ -0,0 +1,142 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas3.singleprecision; + +import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.SGEMM_P; +import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.SGEMM_Q; +import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.SGEMM_R; +import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.SGEMM_UNROLL_N; +import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.VECTOR_LENGTH; +import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.VECTOR_LENGTH2; +import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.VECTOR_LENGTH4; + +import com.huawei.vectorblas.utils.BlasUtils; +import com.huawei.vectorblas.utils.Lsame; + +public class Sgemm { + public static void sgemm(String transa, String transb, int m, int n, int k, float alpha, float[] a, int aOffset, + int lda, float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc) { + BlasUtils.checkParameter("SGEMM", 1, Lsame.lsame(transa, "N") || Lsame.lsame(transa, "T")); + BlasUtils.checkParameter("SGEMM", 2, Lsame.lsame(transb, "N") || Lsame.lsame(transb, "T")); + boolean transaFlag = Lsame.lsame(transa, "N"); + boolean transbFlag = Lsame.lsame(transb, "N"); + BlasUtils.checkParameter("SGEMM", 3, m >= 0); + BlasUtils.checkParameter("SGEMM", 4, n >= 0); + BlasUtils.checkParameter("SGEMM", 5, k >= 0); + BlasUtils.checkParameter("SGEMM", 8, lda >= Math.max(1, (transaFlag ? m : k))); + BlasUtils.checkParameter("SGEMM", 10, ldb >= Math.max(1, (transbFlag ? 
k : n))); + BlasUtils.checkParameter("SGEMM", 13, ldc >= Math.max(1, m)); + + if (m == 0 || n == 0) { + return; + } + if (Float.compare(beta, 1.0f) != 0) { + BlasUtils.checkBlasArray("c", cOffset, (m - 1) + (n - 1) * ldc, c.length); + SblasLevel3.betaMulC(m, n, beta, c, cOffset, ldc); + } + if (BlasUtils.isZero(alpha) || k == 0) { + return; + } + BlasUtils.checkBlasArray("a", aOffset, ((transaFlag ? m : k) - 1) + ((transaFlag ? k : m) - 1) * lda, a.length); + BlasUtils.checkBlasArray("b", bOffset, ((transbFlag ? k : n) - 1) + ((transbFlag ? n : k) - 1) * ldb, b.length); + BlasUtils.checkBlasArray("c", cOffset, (m - 1) + (n - 1) * ldc, c.length); + sgemmVector(transa, transb, m, n, k, a, aOffset, lda, alpha, b, bOffset, ldb, c, cOffset, ldc); + } + + private static void sgemmVector(String transa, String transb, int sizeM, int sizeN, int sizeK, float[] sa, + int aOffset, int lda, float alpha, float[] sb, int bOffset, int ldb, float[] sc, int cOffset, int ldc) { + int mc = Math.min(SGEMM_P, sizeM); + int nc = Math.min(SGEMM_R, sizeN); + int kc = Math.min(SGEMM_Q, sizeK); + float[] packa = new float[kc * mc]; + float[] packb = new float[kc * nc]; + for (int ns = 0; ns < sizeN; ns += nc) { + nc = Math.min(nc, sizeN - ns); + for (int ks = 0; ks < sizeK; ks += kc) { + kc = Math.min(kc, sizeK - ks); + if (Lsame.lsame(transb, "N")) { + SblasLevel3.onCopy(kc, nc, sb, ks, ns, bOffset, ldb, packb, 0); // packing matrix b + } else { + otCopy(nc, kc, sb, ns, ks, bOffset, ldb, packb, 0); + } + for (int ms = 0; ms < sizeM; ms += mc) { + mc = Math.min(mc, sizeM - ms); + if (Lsame.lsame(transa, "N")) { + SblasLevel3.itCopy(mc, kc, sa, ms, ks, aOffset, lda, packa, 0); // packing matrix a + } else { + inCopy(kc, mc, sa, ks, ms, aOffset, lda, packa, 0); + } + SblasLevel3.kernelOperation16x4(mc, nc, kc, alpha, packa, packb, 0, sc, ldc, cOffset, ms, ns); + } + } + } + } + + /** + * otCopy method is used for transpose packing matrix in the right. 
+ */ + private static void otCopy(int sizeM, int sizeN, float[] src, int srcRow, int srcCol, int srcOffset, int srcLd, + float[] dst, int dstOffset) { + int row = 0; + int colPackSize = SGEMM_UNROLL_N; + int dstIndex = 0; + for (; row < sizeM - sizeM % colPackSize; row += colPackSize) { + int col = 0; + for (; col < sizeN; col += 1) { + System.arraycopy(src, (srcRow + row) + (srcCol + col) * srcLd + srcOffset, dst, + dstOffset + dstIndex, SGEMM_UNROLL_N); + dstIndex += colPackSize; + } + } + for (; row < sizeM; row += 1) { + int col = 0; + for (; col < sizeN; col += 1) { + dst[dstOffset + dstIndex] = src[(srcRow + row) + (srcCol + col) * srcLd + srcOffset]; + dstIndex += 1; + } + } + } + + /** + * inCopy is used for normally packing matrix in the left. + */ + private static void inCopy(int sizeM, int sizeN, float[] src, int srcRow, int srcCol, int srcOffset, int srcLd, + float[] dst, int dstOffset) { + int col = 0; + int dstIndex = 0; + int[] vectorLengthList = {VECTOR_LENGTH4, VECTOR_LENGTH2, VECTOR_LENGTH, 1}; + for (int vectorLen : vectorLengthList) { + while (col + vectorLen <= sizeN) { + int row = 0; + for (; row < sizeM; row++) { + for (int count = 0; count < vectorLen; count++) { + dst[dstOffset + dstIndex + count] = src[srcOffset + (srcRow + row) + (srcCol + (col + count)) + * srcLd]; + } + dstIndex += vectorLen; + } + col += vectorLen; + } + } + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/singleprecision/Ssymm.java b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/singleprecision/Ssymm.java new file mode 100644 index 0000000000000000000000000000000000000000..507077e5d2fe1f2d9367e571765f148f55a6ab51 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/blas3/singleprecision/Ssymm.java @@ -0,0 +1,261 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. 
+ * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.huawei.vectorblas.blas3.singleprecision; + +import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.SGEMM_P; +import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.SGEMM_Q; +import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.SGEMM_R; +import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.SGEMM_UNROLL_N; +import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.VECTOR_LENGTH; +import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.VECTOR_LENGTH2; +import static com.huawei.vectorblas.blas3.singleprecision.SblasLevel3.VECTOR_LENGTH4; + +import com.huawei.vectorblas.utils.BlasUtils; +import com.huawei.vectorblas.utils.Lsame; + +public class Ssymm { + public static void ssymm(String side, String uplo, int m, int n, float alpha, float[] a, int aOffset, int lda, + float[] b, int bOffset, int ldb, float beta, float[] c, int cOffset, int ldc) { + BlasUtils.checkParameter("SSYMM", 1, 
Lsame.lsame(side, "L") || Lsame.lsame(side, "R")); + BlasUtils.checkParameter("SSYMM", 2, Lsame.lsame(uplo, "U") || Lsame.lsame(uplo, "L")); + boolean sideFlag = Lsame.lsame(side, "L"); + BlasUtils.checkParameter("SSYMM", 3, m >= 0); + BlasUtils.checkParameter("SSYMM", 4, n >= 0); + BlasUtils.checkParameter("SSYMM", 7, lda >= Math.max(1, (sideFlag ? m : n))); + BlasUtils.checkParameter("SSYMM", 9, ldb >= Math.max(1, m)); + BlasUtils.checkParameter("SSYMM", 12, ldc >= Math.max(1, m)); + + if (m == 0 || n == 0) { + return; + } + if (Float.compare(beta, 1.0f) != 0) { + BlasUtils.checkBlasArray("c", cOffset, (m - 1) + (n - 1) * ldc, c.length); + SblasLevel3.betaMulC(m, n, beta, c, cOffset, ldc); + } + if (BlasUtils.isZero(alpha)) { + return; + } + BlasUtils.checkBlasArray("a", aOffset, ((sideFlag ? m : n) - 1) + ((sideFlag ? m : n) - 1) * lda, a.length); + BlasUtils.checkBlasArray("b", bOffset, (m - 1) + (n - 1) * ldb, b.length); + BlasUtils.checkBlasArray("c", cOffset, (m - 1) + (n - 1) * ldc, c.length); + ssymmVector(side, uplo, m, n, sideFlag ? m : n, a, aOffset, lda, alpha, b, bOffset, ldb, c, cOffset, ldc); + } + + private static void ssymmVector(String side, String uplo, int sizeM, int sizeN, int sizeK, float[] sa, int aOffset, + int lda, float alpha, float[] sb, int bOffset, int ldb, float[] sc, int cOffset, int ldc) { + int mc = Math.min(SGEMM_P, sizeM); + int nc = Math.min(SGEMM_R, sizeN); + int kc = Math.min(SGEMM_Q, sizeK); + boolean sideFlag = Lsame.lsame(side, "L"); + float[] packa = new float[kc * (sideFlag ? mc : nc)]; + float[] packb = new float[kc * (sideFlag ? 
nc : mc)]; + for (int ns = 0; ns < sizeN; ns += nc) { + nc = Math.min(nc, sizeN - ns); + for (int ks = 0; ks < sizeK; ks += kc) { + kc = Math.min(kc, sizeK - ks); + if (Lsame.lsame(side, "L")) { + SblasLevel3.onCopy(kc, nc, sb, ks, ns, bOffset, ldb, packb, 0); + } else if (Lsame.lsame(side, "R") && Lsame.lsame(uplo, "U")) { + outCopy(kc, nc, sa, aOffset, lda, packa, 0, ns, ks); + } else { + oltCopy(kc, nc, sa, aOffset, lda, packa, 0, ns, ks); + } + for (int ms = 0; ms < sizeM; ms += mc) { + mc = Math.min(mc, sizeM - ms); + if (Lsame.lsame(side, "L") && Lsame.lsame(uplo, "U")) { + iutCopy(kc, mc, sa, aOffset, lda, packa, 0, ms, ks); + SblasLevel3.kernelOperation16x4(mc, nc, kc, alpha, packa, packb, 0, sc, ldc, cOffset, ms, ns); + } else if (Lsame.lsame(side, "L") && Lsame.lsame(uplo, "L")) { + iltCopy(kc, mc, sa, aOffset, lda, packa, 0, ms, ks); + SblasLevel3.kernelOperation16x4(mc, nc, kc, alpha, packa, packb, 0, sc, ldc, cOffset, ms, ns); + } else { + SblasLevel3.itCopy(mc, kc, sb, ms, ks, bOffset, ldb, packb, 0); + SblasLevel3.kernelOperation16x4(mc, nc, kc, alpha, packb, packa, 0, sc, ldc, cOffset, ms, ns); + } + } + } + } + } + + /** + * oltCopy method is used for packing lower matrix in the right. 
+ */ + private static void oltCopy(int sizeM, int sizeN, float[] src, int srcOffset, int srcLd, float[] dst, + int dstOffset, int posX, int posY) { + int dstIndex = 0; + int countJ = sizeN; + int[] vectorLenthList = {SGEMM_UNROLL_N, 1}; + for (int vectorLen : vectorLenthList) { + while (countJ - vectorLen >= 0) { + int delta = posX - posY; + int[] offset = new int[vectorLen]; + for (int index = 0; index < vectorLen; index++) { + if (delta > -index) { + offset[index] = posX + index + posY * srcLd; + } else { + offset[index] = posY + (posX + index) * srcLd; + } + } + + int countI = sizeM; + for (; countI > 0; countI--) { + // read and write data + for (int index = 0; index < vectorLen; index++) { + dst[dstOffset + dstIndex] = src[srcOffset + offset[index]]; + dstIndex += 1; + if (delta > -index) { + offset[index] += srcLd; + } else { + offset[index]++; + } + } + delta--; + } + posX += vectorLen; + countJ -= vectorLen; + } + } + } + + /** + * outCopy method is used for packing upper matrix in the right. 
+ */ + private static void outCopy(int sizeM, int sizeN, float[] src, int srcOffset, int srcLd, float[] dst, + int dstOffset, int posX, int posY) { + int dstIndex = 0; + int countJ = sizeN; + int[] vectorLenthList = {SGEMM_UNROLL_N, 1}; + for (int vectorLen : vectorLenthList) { + while (countJ - vectorLen >= 0) { + int delta = posX - posY; + int[] offset = new int[vectorLen]; + for (int index = 0; index < vectorLen; index++) { + if (delta > -index) { + offset[index] = posY + (posX + index) * srcLd; + } else { + offset[index] = posX + index + posY * srcLd; + } + } + + int countI = sizeM; + for (; countI > 0; countI--) { + // read and write data + for (int index = 0; index < vectorLen; index++) { + dst[dstOffset + dstIndex] = src[srcOffset + offset[index]]; + dstIndex += 1; + if (delta > -index) { + offset[index]++; + } else { + offset[index] += srcLd; + } + } + delta--; + } + + posX += vectorLen; + countJ -= vectorLen; + } + } + } + + /** + * iltCopy method is used for packing lower matrix in the left. 
+ */ + private static void iltCopy(int sizeM, int sizeN, float[] src, int srcOffset, int srcLd, float[] dst, + int dstOffset, int posX, int posY) { + int dstIndex = 0; + int countJ = sizeN; + int[] vectorLengthList = {VECTOR_LENGTH4, VECTOR_LENGTH2, VECTOR_LENGTH, 1}; + for (int vectorLen : vectorLengthList) { + while (countJ - vectorLen >= 0) { + int delta = posX - posY; + int[] offset = new int[vectorLen]; + for (int index = 0; index < vectorLen; index++) { + if (delta > -index) { + offset[index] = posX + index + posY * srcLd; + } else { + offset[index] = posY + (posX + index) * srcLd; + } + } + + int countI = sizeM; + for (; countI > 0; countI--) { + // read and write data + for (int index = 0; index < vectorLen; index++) { + dst[dstOffset + dstIndex] = src[srcOffset + offset[index]]; + dstIndex += 1; + if (delta > -index) { + offset[index] += srcLd; + } else { + offset[index]++; + } + } + delta--; + } + + posX += vectorLen; + countJ -= vectorLen; + } + } + } + + /** + * iutCopy method is used for packing upper matrix in the left. 
+ */ + private static void iutCopy(int sizeM, int sizeN, float[] src, int srcOffset, int srcLd, float[] dst, + int dstOffset, int posX, int posY) { + int dstIndex = 0; + int countJ = sizeN; + int[] vectorLengthList = {VECTOR_LENGTH4, VECTOR_LENGTH2, VECTOR_LENGTH, 1}; + for (int vectorLen : vectorLengthList) { + while (countJ - vectorLen >= 0) { + int delta = posX - posY; + int[] offset = new int[vectorLen]; + for (int index = 0; index < vectorLen; index++) { + if (delta > -index) { + offset[index] = posY + (posX + index) * srcLd; + } else { + offset[index] = posX + index + posY * srcLd; + } + } + + int countI = sizeM; + for (; countI > 0; countI--) { + // read and write data + for (int index = 0; index < vectorLen; index++) { + dst[dstOffset + dstIndex] = src[srcOffset + offset[index]]; + dstIndex += 1; + if (delta > -index) { + offset[index]++; + } else { + offset[index] += srcLd; + } + } + delta--; + } + posX += vectorLen; + countJ -= vectorLen; + } + } + } +} \ No newline at end of file diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/utils/ArrayUtil.java b/vectorBlas/src/main/java/com/huawei/vectorblas/utils/ArrayUtil.java new file mode 100644 index 0000000000000000000000000000000000000000..61aba91f6f839afec455ae2c13b9868a9b933da9 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/utils/ArrayUtil.java @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
/**
 * Small array helpers: loop-bound rounding and reproducible pseudo-random fill.
 *
 * <p>All random values come from one shared generator seeded with 0, so the sequence depends on the order in which
 * the random methods are called.
 */
public class ArrayUtil {
    private static final Random RANDOM = new Random(0);

    /**
     * Rounds {@code length} down to a multiple of {@code size}.
     *
     * @param length value to round down
     * @param size step size; a size of zero yields 0
     * @return the rounded-down bound
     */
    public static int loopBound(int length, int size) {
        return roundDown(length, size);
    }

    private static int roundDown(int length, int size) {
        boolean zeroOrPowerOfTwo = (size & (size - 1)) == 0;
        if (zeroOrPowerOfTwo) {
            // A mask clears the low bits; for size 0 the mask is empty and the result is 0.
            return length & ~(size - 1);
        }
        return roundDownNPOT(length, size);
    }

    private static int roundDownNPOT(int length, int size) {
        int remainder = (length >= 0) ? length % size : Math.floorMod(length, Math.abs(size));
        return length - remainder;
    }

    /**
     * Returns the next double of the shared generator.
     */
    public static double randomDouble() {
        return RANDOM.nextDouble();
    }

    /**
     * Overwrites every element of {@code arr}, in index order, with the next generator double minus 0.5.
     */
    public static void randomDoubleArray(double[] arr) {
        int count = arr.length;
        int pos = 0;
        while (pos < count) {
            arr[pos] = RANDOM.nextDouble() - 0.5d; // Produce double values between -0.5 and 0.5.
            pos++;
        }
    }

    /**
     * Returns the next float of the shared generator.
     */
    public static float randomFloat() {
        return RANDOM.nextFloat();
    }

    /**
     * Overwrites every element of {@code arr}, in index order, with the next generator float minus 0.5.
     */
    public static void randomFloatArray(float[] arr) {
        int count = arr.length;
        int pos = 0;
        while (pos < count) {
            arr[pos] = RANDOM.nextFloat() - 0.5f; // Produce float values between -0.5 and 0.5.
            pos++;
        }
    }
}
+ */ + +package com.huawei.vectorblas.utils; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Locale; +import java.util.Random; + +public class BlasUtils { + private static final Logger LOG = LoggerFactory.getLogger(BlasUtils.class); + private static Random rand = new Random(0); + + public static void checkParameter(String name, int index, boolean isValid) { + if (!isValid) { + String msg = String.format(Locale.ROOT, + "** On entry to %s parameter number %d had an illegal value", name, index); + throw new IllegalArgumentException(msg); + } + } + + public static void checkBlasArray(String arrName, int offset, int index, int length) { + try { + checkBound(index + offset, length); + checkBound(offset, length); + } catch (ArrayIndexOutOfBoundsException e) { + throw new ArrayIndexOutOfBoundsException( + "Index " + index + " of array " + arrName + " out of bounds for length: " + length); + } + } + + public static void checkBound(int index, int length) { + if (index < 0 || index >= length) { + throw new ArrayIndexOutOfBoundsException(); + } + } + + public static boolean isZero(double val) { + return Double.compare(val, 0.0d) == 0 || Double.compare(val, -0.0d) == 0; + } + + public static boolean isZero(float val) { + return Float.compare(val, 0.0f) == 0 || Float.compare(val, -0.0f) == 0; + } + + /** + * Get double precision machine epsilon. + */ + public static double getEpsd() { + double eps; + double half = 0.5d; + double maxVal; + double f1 = 0.5d; + do { + eps = f1; + f1 *= half; + maxVal = 1.0d + f1; + } while (Double.compare(maxVal, 1.0d) != 0); + return eps; + } + + /** + * Get single precision machine epsilon. + */ + public static float getEpsf() { + float eps; + float half = 0.5f; + float maxVal; + float f1 = 0.5f; + do { + eps = f1; + f1 *= half; + maxVal = 1.0f + f1; + } while (Float.compare(maxVal, 1.0f) != 0); + return eps; + } + + /** + * Generate general matrix of double precision. 
+ */ + public static void gegen(int sizeM, int sizeN, double[] da, int aOffset, int lda) { + try { + int idx = (lda >= 0 ? 0 : (sizeN - 1) * -lda) + aOffset; + for (int j = 0; j < sizeN; j++) { + for (int i = 0; i < sizeM; i++) { + da[idx + i] = rand.nextDouble() - 0.5d; + } + idx += lda; + } + } catch (ArrayIndexOutOfBoundsException e) { + LOG.error(e.toString()); + } + } + + /** + * Generate general matrix of single precision. + */ + public static void gegen(int sizeM, int sizeN, float[] sa, int aOffset, int lda) { + try { + int idx = (lda >= 0 ? 0 : (sizeN - 1) * -lda) + aOffset; + for (int j = 0; j < sizeN; j++) { + for (int i = 0; i < sizeM; i++) { + sa[idx + i] = rand.nextFloat() - 0.5f; + } + idx += lda; + } + } catch (ArrayIndexOutOfBoundsException e) { + LOG.error(e.toString()); + } + } + + /** + * Calculates the infinity norm of single precision vector. + */ + public static float getInfnrm(int sizeN, float[] sx, int xOffset, int incX) { + int idx = (incX >= 0 ? 0 : (sizeN - 1) * -incX) + xOffset; + float max = 0.0f; + try { + for (int i = 0; i < sizeN; i++, idx += incX) { + max = Math.max(Math.abs(sx[idx]), max); + } + } catch (ArrayIndexOutOfBoundsException e) { + LOG.error(e.toString()); + } + return max; + } + + /** + * Calculates the infinity norm of double precision vector. + */ + public static double getInfnrm(int sizeN, double[] dx, int xOffset, int incX) { + int idx = (incX >= 0 ? 0 : (sizeN - 1) * -incX) + xOffset; + double max = 0.0d; + try { + for (int i = 0; i < sizeN; i++, idx += incX) { + max = Math.max(Math.abs(dx[idx]), max); + } + } catch (ArrayIndexOutOfBoundsException e) { + LOG.error(e.toString()); + } + return max; + } + + /** + * Calculates the difference between two double precision vectors. + */ + public static void getVdiff(int sizeN, double[] dx, int xOffset, int incX, double[] dy, int yOffset, int incY, + double[] dz, int zOffset, int incZ) { + int xIdx = (incX >= 0 ? 0 : (sizeN - 1) * -incX) + xOffset; + int yIdx = (incY >= 0 ? 
0 : (sizeN - 1) * -incY) + yOffset; + int zIdx = (incZ >= 0 ? 0 : (sizeN - 1) * -incZ) + zOffset; + for (int i = sizeN; i > 0; i--, xIdx += incX, yIdx += incY, zIdx += incZ) { + dz[zIdx] = dx[xIdx] - dy[yIdx]; + } + } + + /** + * Calculates the difference between two single precision vectors. + */ + public static void getVdiff(int sizeN, float[] sx, int xOffset, int incX, float[] sy, int yOffset, int incY, + float[] sz, int zOffset, int incZ) { + int xIdx = (incX >= 0 ? 0 : (sizeN - 1) * -incX) + xOffset; + int yIdx = (incY >= 0 ? 0 : (sizeN - 1) * -incY) + yOffset; + int zIdx = (incZ >= 0 ? 0 : (sizeN - 1) * -incZ) + zOffset; + for (int i = sizeN; i > 0; i--, xIdx += incX, yIdx += incY, zIdx += incZ) { + sz[zIdx] = sx[xIdx] - sy[yIdx]; + } + } + + /** + * Calculates the 1-norm of a general rectangular matrix of double precision. + */ + public static double getGenrm1(int sizeM, int sizeN, double[] da, int aOffset, int lda) { + double max = 0.0d; + int offset = aOffset; + try { + for (int j = 0; j < sizeN; j++) { + double t0 = org.netlib.blas.Dasum.dasum(sizeM, da, offset, 1); + max = Math.max(t0, max); + offset += lda; + } + } catch (ArrayIndexOutOfBoundsException e) { + LOG.error(e.toString()); + } + return max; + } + + /** + * Calculates the 1-norm of a general rectangular matrix of single precision. + */ + public static float getGenrm1(int sizeM, int sizeN, float[] sa, int aOffset, int lda) { + float max = 0.0f; + int offset = aOffset; + try { + for (int j = 0; j < sizeN; j++) { + float t0 = org.netlib.blas.Sasum.sasum(sizeM, sa, offset, 1); + max = Math.max(t0, max); + offset += lda; + } + } catch (ArrayIndexOutOfBoundsException e) { + LOG.error(e.toString()); + } + return max; + } + + /** + * Calculates the 1-norm of (A-B) matrix of double precision. 
+ */ + public static double getGediffnrm1(int sizeM, int sizeN, double[] da, int aOffset, int lda, + double[] db, int bOffset, int ldb) { + double max = 0.0d; + int offset1 = aOffset; + int offset2 = bOffset; + for (int j = 0; j < sizeN; j++) { + double t0 = 0.0d; + for (int i = 0; i < sizeM; i++) { + t0 += Math.abs(da[offset1] - db[offset2]); + } + max = Math.max(t0, max); + offset1 += lda; + offset2 += ldb; + } + return max; + } + + /** + * Calculates the 1-norm of (A-B) matrix of single precision. + */ + public static float getGediffnrm1(int sizeM, int sizeN, float[] sa, int aOffset, int lda, + float[] sb, int bOffset, int ldb) { + float max = 0.0f; + int offset1 = aOffset; + int offset2 = bOffset; + for (int j = 0; j < sizeN; j++) { + float t0 = 0.0f; + for (int i = 0; i < sizeM; i++) { + t0 += Math.abs(sa[offset1] - sb[offset2]); + } + max = Math.max(t0, max); + offset1 += lda; + offset2 += ldb; + } + return max; + } + + /** + * Calculates the norm of a double precision symmetric packed matrix. 
+ */ + public static double getSpnrm(String uplo, int sizeN, double[] da, int aOffset) { + if (sizeN <= 0) { + return 0.0d; + } + double[] work = new double[sizeN]; + try { + if (uplo.equalsIgnoreCase("U")) { + for (int j = 0, iaij = 0; j < sizeN; j++) { + double t0 = 0.0d; + for (int i = 0; i < j; i++, iaij++) { + work[i] += Math.abs(da[iaij + aOffset]); + t0 += Math.abs(da[iaij + aOffset]); + } + work[j] += Math.abs(da[iaij + aOffset]) + t0; + iaij++; + } + } else { + for (int j = 0, iaij = 0; j < sizeN; j++) { + double t0 = 0.0d; + work[j] = Math.abs(da[iaij + aOffset]); + iaij++; + for (int i = j + 1; i < sizeN; i++, iaij++) { + work[i] += Math.abs(da[iaij + aOffset]); + t0 += Math.abs(da[iaij + aOffset]); + } + work[j] += t0; + } + } + } catch (ArrayIndexOutOfBoundsException e) { + LOG.error(e.toString()); + } + double max = work[0]; + for (int j = 1; j < sizeN; j++) { + max = Math.max(max, work[j]); + } + return max; + } + + /** + * Calculates the norm of a single precision symmetric packed matrix. 
+ */ + public static float getSpnrm(String uplo, int sizeN, float[] sa, int aOffset) { + if (sizeN <= 0) { + return 0.0f; + } + float[] work = new float[sizeN]; + try { + if (uplo.equalsIgnoreCase("U")) { + for (int j = 0, iaij = 0; j < sizeN; j++) { + float t0 = 0.0f; + for (int i = 0; i < j; i++, iaij++) { + work[i] += Math.abs(sa[iaij + aOffset]); + t0 += Math.abs(sa[iaij + aOffset]); + } + work[j] += Math.abs(sa[iaij + aOffset]) + t0; + iaij++; + } + } else { + for (int j = 0, iaij = 0; j < sizeN; j++) { + float t0 = 0.0f; + work[j] = Math.abs(sa[iaij + aOffset]); + iaij++; + for (int i = j + 1; i < sizeN; i++, iaij++) { + work[i] += Math.abs(sa[iaij + aOffset]); + t0 += Math.abs(sa[iaij + aOffset]); + } + work[j] += t0; + } + } + } catch (ArrayIndexOutOfBoundsException e) { + LOG.error(e.toString()); + } + float max = work[0]; + for (int j = 1; j < sizeN; j++) { + max = Math.max(max, work[j]); + } + return max; + } + + /** + * Calculates the norm of a upper or lower triangular part of the double precision symmetric matrix. 
+ */ + public static double getSynrm(String uplo, int sizeN, double[] da, int aOffset, int lda) { + int ldap12 = lda + 1; + if (sizeN <= 0) { + return 0.0d; + } + double[] work = new double[sizeN]; + if (uplo.equalsIgnoreCase("U")) { + for (int j = 0, jaj = 0; j < sizeN; j++, jaj += lda) { + double t0 = 0.0d; + int iaij = jaj; + for (int i = 0; i < j; i++, iaij++) { + work[i] += Math.abs(da[iaij + aOffset]); + t0 += Math.abs(da[iaij + aOffset]); + } + work[j] += Math.abs(da[iaij + aOffset]) + t0; + } + } else { + for (int j = 0, jaj = 0; j < sizeN; j++, jaj += ldap12) { + double t0 = 0.0d; + work[j] = Math.abs(da[jaj + aOffset]); + for (int i = j + 1, iaij = jaj + 1; i < sizeN; i++, iaij++) { + work[i] += Math.abs(da[iaij + aOffset]); + t0 += Math.abs(da[iaij + aOffset]); + } + work[j] += t0; + } + } + double max = work[0]; + for (int j = 1; j < sizeN; j++) { + max = Math.max(work[j], max); + } + return max; + } + + /** + * Calculates the norm of a upper or lower triangular part of the single precision symmetric matrix. 
+ */ + public static float getSynrm(String uplo, int sizeN, float[] sa, int aOffset, int lda) { + int ldap12 = lda + 1; + if (sizeN <= 0) { + return 0.0f; + } + float[] work = new float[sizeN]; + if (uplo.equalsIgnoreCase("U")) { + for (int j = 0, jaj = 0; j < sizeN; j++, jaj += lda) { + float t0 = 0.0f; + int iaij = jaj; + for (int i = 0; i < j; i++, iaij++) { + work[i] += Math.abs(sa[iaij + aOffset]); + t0 += Math.abs(sa[iaij + aOffset]); + } + work[j] += Math.abs(sa[iaij + aOffset]) + t0; + } + } else { + for (int j = 0, jaj = 0; j < sizeN; j++, jaj += ldap12) { + float t0 = 0.0f; + work[j] = Math.abs(sa[jaj + aOffset]); + for (int i = j + 1, iaij = jaj + 1; i < sizeN; i++, iaij++) { + work[i] += Math.abs(sa[iaij + aOffset]); + t0 += Math.abs(sa[iaij + aOffset]); + } + work[j] += t0; + } + } + float max = work[0]; + for (int j = 1; j < sizeN; j++) { + max = Math.max(work[j], max); + } + return max; + } +} diff --git a/vectorBlas/src/main/java/com/huawei/vectorblas/utils/Lsame.java b/vectorBlas/src/main/java/com/huawei/vectorblas/utils/Lsame.java new file mode 100644 index 0000000000000000000000000000000000000000..8da45426d0a1da0acec22a733c7c40eaaa7a7278 --- /dev/null +++ b/vectorBlas/src/main/java/com/huawei/vectorblas/utils/Lsame.java @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2023. Huawei Technologies Co., Ltd. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
/**
 * Case-insensitive comparison of BLAS option flags such as "N", "T", "U" or "L".
 */
public final class Lsame {
    /**
     * LSAME returns .TRUE. if CA is the same letter as CB regardless of case.
     *
     * <p>More precisely, the result is true when {@code cB} starts with the whole of {@code cA}, ignoring case.
     * A null or empty {@code cA} and a null {@code cB} never match, so an empty flag cannot pass as every
     * option at once.
     *
     * @param cA character a, the flag supplied by the caller
     * @param cB character b, the expected option letter
     * @return true or false
     */
    public static boolean lsame(String cA, String cB) {
        if (cA == null || cB == null || cA.isEmpty()) {
            return false;
        }
        return cA.regionMatches(true, 0, cB, 0, cA.length());
    }
}