From 6288c8579ce43fdf9b48df9271e652572334e83e Mon Sep 17 00:00:00 2001 From: Caoruihong Date: Thu, 26 Aug 2021 12:28:36 +0800 Subject: [PATCH] feat: upgrade to v21.02 Signed-off-by: Caoruihong Change-Id: I4405c75294b42f93414f64dccc50d7242abe4fdd --- Makefile | 2 +- README | 2 +- README.OpenSource | 4 +- config.mk.dist | 2 +- math/cosf.c | 2 +- math/erf.c | 244 ++++++++++++++++++++ math/erf_data.c | 85 +++++++ math/erff.c | 104 +++++++++ math/erff_data.c | 22 ++ math/exp.c | 2 +- math/exp2.c | 2 +- math/expf.c | 2 +- math/include/mathlib.h | 2 +- math/log.c | 2 +- math/log2.c | 2 +- math/logf.c | 2 +- math/logf_data.c | 2 +- math/math_config.h | 45 +++- math/math_errf.c | 16 +- math/pow.c | 2 +- math/powf.c | 2 +- math/powf_log2_data.c | 2 +- math/sincosf.c | 2 +- math/sincosf_data.c | 2 +- math/sinf.c | 2 +- math/test/mathbench.c | 4 +- math/test/mathtest.c | 2 +- math/test/rtest/dotest.c | 2 +- math/test/rtest/intern.h | 2 +- math/test/rtest/main.c | 2 +- math/test/rtest/random.c | 2 +- math/test/rtest/random.h | 2 +- math/test/rtest/semi.c | 2 +- math/test/rtest/semi.h | 2 +- math/test/rtest/types.h | 2 +- math/test/rtest/wrappers.c | 2 +- math/test/rtest/wrappers.h | 2 +- math/test/runulp.sh | 23 +- math/test/testcases/directed/cosf.tst | 2 +- math/test/testcases/directed/erf.tst | 17 ++ math/test/testcases/directed/erff.tst | 17 ++ math/test/testcases/directed/exp.tst | 2 +- math/test/testcases/directed/exp2.tst | 2 +- math/test/testcases/directed/exp2f.tst | 2 +- math/test/testcases/directed/expf.tst | 2 +- math/test/testcases/directed/log.tst | 2 +- math/test/testcases/directed/log2.tst | 2 +- math/test/testcases/directed/log2f.tst | 2 +- math/test/testcases/directed/logf.tst | 2 +- math/test/testcases/directed/pow.tst | 2 +- math/test/testcases/directed/powf.tst | 2 +- math/test/testcases/directed/sincosf.tst | 2 +- math/test/testcases/directed/sinf.tst | 2 +- math/test/testcases/random/double.tst | 2 +- math/test/testcases/random/float.tst | 2 +- 
math/test/ulp.c | 4 +- math/tools/plot.py | 0 math/tools/remez.jl | 2 +- math/v_math.h | 2 +- networking/test/chksum.c | 2 +- string/Dir.mk | 9 +- string/aarch64/__mtag_tag_region.S | 100 +++++++++ string/aarch64/__mtag_tag_zero_region.S | 100 +++++++++ string/aarch64/memchr-mte.S | 2 + string/aarch64/memchr-sve.S | 11 +- string/aarch64/memchr.S | 4 +- string/aarch64/memcmp-sve.S | 10 +- string/aarch64/memcmp.S | 5 +- string/aarch64/memcpy-advsimd.S | 10 +- string/aarch64/memcpy.S | 3 + string/aarch64/memrchr.S | 1 + string/aarch64/memset.S | 6 +- string/aarch64/strchr-mte.S | 1 + string/aarch64/strchr-sve.S | 8 +- string/aarch64/strchr.S | 3 +- string/aarch64/strchrnul-mte.S | 1 + string/aarch64/strchrnul-sve.S | 2 +- string/aarch64/strchrnul.S | 3 +- string/aarch64/strcmp-mte.S | 18 +- string/aarch64/strcmp-sve.S | 11 +- string/aarch64/strcmp.S | 2 + string/aarch64/strcpy-mte.S | 2 + string/aarch64/strcpy-sve.S | 9 +- string/aarch64/strcpy.S | 4 +- string/aarch64/strlen-mte.S | 1 + string/aarch64/strlen-sve.S | 11 +- string/aarch64/strlen.S | 275 +++++++++++------------ string/aarch64/strncmp-mte.S | 24 +- string/aarch64/strncmp-sve.S | 10 +- string/aarch64/strncmp.S | 18 +- string/aarch64/strnlen-sve.S | 11 +- string/aarch64/strnlen.S | 2 + string/aarch64/strrchr-mte.S | 1 + string/aarch64/strrchr-sve.S | 8 +- string/aarch64/strrchr.S | 1 + string/arm/memchr.S | 4 +- string/arm/memcpy.S | 26 +-- string/arm/memset.S | 3 +- string/arm/strcmp-armv6m.S | 2 +- string/arm/strcmp.S | 5 +- string/arm/strcpy.c | 12 +- string/arm/strlen-armv6t2.S | 4 +- string/asmdefs.h | 16 +- string/bench/memcpy.c | 34 +++ string/bench/strlen.c | 221 ++++++++++++++++++ string/include/stringlib.h | 11 +- string/test/__mtag_tag_region.c | 147 ++++++++++++ string/test/__mtag_tag_zero_region.c | 147 ++++++++++++ string/test/memcmp.c | 2 +- string/test/memcpy.c | 2 +- string/test/memmove.c | 2 +- string/test/memset.c | 2 +- string/test/strcmp.c | 2 +- string/test/strncmp.c | 2 +- 
string/test/strrchr.c | 4 +- 115 files changed, 1680 insertions(+), 334 deletions(-) create mode 100644 math/erf.c create mode 100644 math/erf_data.c create mode 100644 math/erff.c create mode 100644 math/erff_data.c mode change 100644 => 100755 math/test/runulp.sh create mode 100644 math/test/testcases/directed/erf.tst create mode 100644 math/test/testcases/directed/erff.tst mode change 100644 => 100755 math/tools/plot.py mode change 100644 => 100755 math/tools/remez.jl create mode 100644 string/aarch64/__mtag_tag_region.S create mode 100644 string/aarch64/__mtag_tag_zero_region.S create mode 100644 string/bench/strlen.c create mode 100644 string/test/__mtag_tag_region.c create mode 100644 string/test/__mtag_tag_zero_region.c diff --git a/Makefile b/Makefile index 89fc13b..169f89e 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Makefile - requires GNU make # -# Copyright (c) 2018-2019, Arm Limited. +# Copyright (c) 2018-2020, Arm Limited. # SPDX-License-Identifier: MIT srcdir = . diff --git a/README b/README index 6cde5cd..9e1a34f 100644 --- a/README +++ b/README @@ -9,7 +9,7 @@ contributor-agreement.pdf. This is needed so upstreaming code to projects that require copyright assignment is possible. Regular quarterly releases are tagged as vYY.MM, the latest -release is v20.05. +release is v21.02. 
Source code layout: diff --git a/README.OpenSource b/README.OpenSource index 45a839b..0e874ba 100644 --- a/README.OpenSource +++ b/README.OpenSource @@ -3,9 +3,9 @@ "Name" : "optimized-routines", "License" : "MIT License", "License File" : "LICENSE", - "Version Number" : "v20.05", + "Version Number" : "v21.02", "Owner" : "zhaotianyu9@huawei.com", - "Upstream URL" : "https://www.mirbsd./mksh.ht://www.arm.com/;https://github.com/ARM-software/optimized-routines", + "Upstream URL" : "https://github.com/ARM-software/optimized-routines", "Description" : "Optimized implementations of various library functions for ARM architecture processors" } ] diff --git a/config.mk.dist b/config.mk.dist index 3e55c98..177e1ac 100644 --- a/config.mk.dist +++ b/config.mk.dist @@ -1,6 +1,6 @@ # Example config.mk # -# Copyright (c) 2018-2019, Arm Limited. +# Copyright (c) 2018-2020, Arm Limited. # SPDX-License-Identifier: MIT # Subprojects to build diff --git a/math/cosf.c b/math/cosf.c index 831b39e..f29f194 100644 --- a/math/cosf.c +++ b/math/cosf.c @@ -1,7 +1,7 @@ /* * Single-precision cos function. * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/erf.c b/math/erf.c new file mode 100644 index 0000000..12d7e51 --- /dev/null +++ b/math/erf.c @@ -0,0 +1,244 @@ +/* + * Double-precision erf(x) function. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "math_config.h" +#include <math.h> +#include <stdint.h> + +#define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3 +#define C 0x1.b0ac16p-1 +#define PA __erf_data.erf_poly_A +#define NA __erf_data.erf_ratio_N_A +#define DA __erf_data.erf_ratio_D_A +#define NB __erf_data.erf_ratio_N_B +#define DB __erf_data.erf_ratio_D_B +#define PC __erf_data.erfc_poly_C +#define PD __erf_data.erfc_poly_D +#define PE __erf_data.erfc_poly_E +#define PF __erf_data.erfc_poly_F + +/* Top 32 bits of a double. 
*/ +static inline uint32_t +top32 (double x) +{ + return asuint64 (x) >> 32; +} + +/* Fast erf implementation using a mix of + rational and polynomial approximations. + Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0. */ +double +erf (double x) +{ + /* Get top word and sign. */ + uint32_t ix = top32 (x); + uint32_t ia = ix & 0x7fffffff; + uint32_t sign = ix >> 31; + + /* Normalized and subnormal cases */ + if (ia < 0x3feb0000) + { /* a = |x| < 0.84375. */ + + if (ia < 0x3e300000) + { /* a < 2^(-28). */ + if (ia < 0x00800000) + { /* a < 2^(-1015). */ + double y = fma (TwoOverSqrtPiMinusOne, x, x); + return check_uflow (y); + } + return x + TwoOverSqrtPiMinusOne * x; + } + + double x2 = x * x; + + if (ia < 0x3fe00000) + { /* a < 0.5 - Use polynomial approximation. */ + double r1 = fma (x2, PA[1], PA[0]); + double r2 = fma (x2, PA[3], PA[2]); + double r3 = fma (x2, PA[5], PA[4]); + double r4 = fma (x2, PA[7], PA[6]); + double r5 = fma (x2, PA[9], PA[8]); + double x4 = x2 * x2; + double r = r5; + r = fma (x4, r, r4); + r = fma (x4, r, r3); + r = fma (x4, r, r2); + r = fma (x4, r, r1); + return fma (r, x, x); /* This fma is crucial for accuracy. */ + } + else + { /* 0.5 <= a < 0.84375 - Use rational approximation. */ + double x4, x8, r1n, r2n, r1d, r2d, r3d; + + r1n = fma (x2, NA[1], NA[0]); + x4 = x2 * x2; + r2n = fma (x2, NA[3], NA[2]); + x8 = x4 * x4; + r1d = fma (x2, DA[0], 1.0); + r2d = fma (x2, DA[2], DA[1]); + r3d = fma (x2, DA[4], DA[3]); + double P = r1n + x4 * r2n + x8 * NA[4]; + double Q = r1d + x4 * r2d + x8 * r3d; + return fma (P / Q, x, x); + } + } + else if (ia < 0x3ff40000) + { /* 0.84375 <= |x| < 1.25. 
*/ + double a2, a4, a6, r1n, r2n, r3n, r4n, r1d, r2d, r3d, r4d; + double a = fabs (x) - 1.0; + r1n = fma (a, NB[1], NB[0]); + a2 = a * a; + r1d = fma (a, DB[0], 1.0); + a4 = a2 * a2; + r2n = fma (a, NB[3], NB[2]); + a6 = a4 * a2; + r2d = fma (a, DB[2], DB[1]); + r3n = fma (a, NB[5], NB[4]); + r3d = fma (a, DB[4], DB[3]); + r4n = NB[6]; + r4d = DB[5]; + double P = r1n + a2 * r2n + a4 * r3n + a6 * r4n; + double Q = r1d + a2 * r2d + a4 * r3d + a6 * r4d; + if (sign) + return -C - P / Q; + else + return C + P / Q; + } + else if (ia < 0x40000000) + { /* 1.25 <= |x| < 2.0. */ + double a = fabs (x); + a = a - 1.25; + + double r1 = fma (a, PC[1], PC[0]); + double r2 = fma (a, PC[3], PC[2]); + double r3 = fma (a, PC[5], PC[4]); + double r4 = fma (a, PC[7], PC[6]); + double r5 = fma (a, PC[9], PC[8]); + double r6 = fma (a, PC[11], PC[10]); + double r7 = fma (a, PC[13], PC[12]); + double r8 = fma (a, PC[15], PC[14]); + + double a2 = a * a; + + double r = r8; + r = fma (a2, r, r7); + r = fma (a2, r, r6); + r = fma (a2, r, r5); + r = fma (a2, r, r4); + r = fma (a2, r, r3); + r = fma (a2, r, r2); + r = fma (a2, r, r1); + + if (sign) + return -1.0 + r; + else + return 1.0 - r; + } + else if (ia < 0x400a0000) + { /* 2 <= |x| < 3.25. */ + double a = fabs (x); + a = fma (0.5, a, -1.0); + + double r1 = fma (a, PD[1], PD[0]); + double r2 = fma (a, PD[3], PD[2]); + double r3 = fma (a, PD[5], PD[4]); + double r4 = fma (a, PD[7], PD[6]); + double r5 = fma (a, PD[9], PD[8]); + double r6 = fma (a, PD[11], PD[10]); + double r7 = fma (a, PD[13], PD[12]); + double r8 = fma (a, PD[15], PD[14]); + double r9 = fma (a, PD[17], PD[16]); + + double a2 = a * a; + + double r = r9; + r = fma (a2, r, r8); + r = fma (a2, r, r7); + r = fma (a2, r, r6); + r = fma (a2, r, r5); + r = fma (a2, r, r4); + r = fma (a2, r, r3); + r = fma (a2, r, r2); + r = fma (a2, r, r1); + + if (sign) + return -1.0 + r; + else + return 1.0 - r; + } + else if (ia < 0x40100000) + { /* 3.25 <= |x| < 4.0. 
*/ + double a = fabs (x); + a = a - 3.25; + + double r1 = fma (a, PE[1], PE[0]); + double r2 = fma (a, PE[3], PE[2]); + double r3 = fma (a, PE[5], PE[4]); + double r4 = fma (a, PE[7], PE[6]); + double r5 = fma (a, PE[9], PE[8]); + double r6 = fma (a, PE[11], PE[10]); + double r7 = fma (a, PE[13], PE[12]); + + double a2 = a * a; + + double r = r7; + r = fma (a2, r, r6); + r = fma (a2, r, r5); + r = fma (a2, r, r4); + r = fma (a2, r, r3); + r = fma (a2, r, r2); + r = fma (a2, r, r1); + + if (sign) + return -1.0 + r; + else + return 1.0 - r; + } + else if (ia < 0x4017a000) + { /* 4 <= |x| < 5.90625. */ + double a = fabs (x); + a = fma (0.5, a, -2.0); + + double r1 = fma (a, PF[1], PF[0]); + double r2 = fma (a, PF[3], PF[2]); + double r3 = fma (a, PF[5], PF[4]); + double r4 = fma (a, PF[7], PF[6]); + double r5 = fma (a, PF[9], PF[8]); + double r6 = fma (a, PF[11], PF[10]); + double r7 = fma (a, PF[13], PF[12]); + double r8 = fma (a, PF[15], PF[14]); + double r9 = PF[16]; + + double a2 = a * a; + + double r = r9; + r = fma (a2, r, r8); + r = fma (a2, r, r7); + r = fma (a2, r, r6); + r = fma (a2, r, r5); + r = fma (a2, r, r4); + r = fma (a2, r, r3); + r = fma (a2, r, r2); + r = fma (a2, r, r1); + + if (sign) + return -1.0 + r; + else + return 1.0 - r; + } + else + { + /* Special cases : erf(nan)=nan, erf(+inf)=+1 and erf(-inf)=-1. */ + if (unlikely (ia >= 0x7ff00000)) + return (double) (1.0 - (sign << 1)) + 1.0 / x; + + if (sign) + return -1.0; + else + return 1.0; + } +} diff --git a/math/erf_data.c b/math/erf_data.c new file mode 100644 index 0000000..807875b --- /dev/null +++ b/math/erf_data.c @@ -0,0 +1,85 @@ +/* + * Shared data between erf and erfc. + * + * Copyright (c) 2019-2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include "math_config.h" + +/* +Minimax approximation of erf +*/ +const struct erf_data __erf_data = { +.erf_poly_A = { +#if ERF_POLY_A_NCOEFFS == 10 +0x1.06eba8214db68p-3, -0x1.812746b037948p-2, 0x1.ce2f21a03872p-4, +-0x1.b82ce30e6548p-6, 0x1.565bcc360a2f2p-8, -0x1.c02d812bc979ap-11, +0x1.f99bddfc1ebe9p-14, -0x1.f42c457cee912p-17, 0x1.b0e414ec20ee9p-20, +-0x1.18c47fd143c5ep-23 +#endif +}, +/* Rational approximation on [0x1p-28, 0.84375] */ +.erf_ratio_N_A = { +0x1.06eba8214db68p-3, -0x1.4cd7d691cb913p-2, -0x1.d2a51dbd7194fp-6, +-0x1.7a291236668e4p-8, -0x1.8ead6120016acp-16 +}, +.erf_ratio_D_A = { +0x1.97779cddadc09p-2, 0x1.0a54c5536cebap-4, 0x1.4d022c4d36b0fp-8, +0x1.15dc9221c1a1p-13, -0x1.09c4342a2612p-18 +}, +/* Rational approximation on [0.84375, 1.25] */ +.erf_ratio_N_B = { +-0x1.359b8bef77538p-9, 0x1.a8d00ad92b34dp-2, -0x1.7d240fbb8c3f1p-2, +0x1.45fca805120e4p-2, -0x1.c63983d3e28ecp-4, 0x1.22a36599795ebp-5, +-0x1.1bf380a96073fp-9 +}, +.erf_ratio_D_B = { +0x1.b3e6618eee323p-4, 0x1.14af092eb6f33p-1, 0x1.2635cd99fe9a7p-4, +0x1.02660e763351fp-3, 0x1.bedc26b51dd1cp-7, 0x1.88b545735151dp-7 +}, +.erfc_poly_C = { +#if ERFC_POLY_C_NCOEFFS == 16 +/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=15 a=1.25 b=2 c=1 d=1.25 */ +0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2, +-0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5, +-0x1.5cffd86b4de16p-6, -0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8, +-0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12, +0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16, +-0x1.578c9e375d37p-19 +#endif +}, +.erfc_poly_D = { +#if ERFC_POLY_D_NCOEFFS == 18 +/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2 */ +0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3, +-0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, 
-0x1.ace732477e494p-2, +-0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2, +0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3, +-0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3, +0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10 +#endif +}, +.erfc_poly_E = { +#if ERFC_POLY_E_NCOEFFS == 14 +/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25 */ +0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14, +-0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12, +0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14, +-0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20, +-0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23 +#endif +}, +.erfc_poly_F = { +#if ERFC_POLY_F_NCOEFFS == 17 +/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4 */ +0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19, +-0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14, +0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11, +-0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11, +0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14, +-0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19 +#endif +} +}; + diff --git a/math/erff.c b/math/erff.c new file mode 100644 index 0000000..a58e825 --- /dev/null +++ b/math/erff.c @@ -0,0 +1,104 @@ +/* + * Single-precision erf(x) function. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include <stdint.h> +#include <math.h> +#include "math_config.h" + +#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f +#define A __erff_data.erff_poly_A +#define B __erff_data.erff_poly_B + +/* Top 12 bits of a float. 
*/ +static inline uint32_t +top12 (float x) +{ + return asuint (x) >> 20; +} + +/* Efficient implementation of erff + using either a pure polynomial approximation or + the exponential of a polynomial. + Worst-case error is 1.09ulps at 0x1.c111acp-1. */ +float +erff (float x) +{ + float r, x2, u; + + /* Get top word. */ + uint32_t ix = asuint (x); + uint32_t sign = ix >> 31; + uint32_t ia12 = top12 (x) & 0x7ff; + + /* Limit of both intervals is 0.875 for performance reasons but coefficients + computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy + from 0.94 to 1.1ulps. */ + if (ia12 < 0x3f6) + { /* a = |x| < 0.875. */ + + /* Tiny and subnormal cases. */ + if (unlikely (ia12 < 0x318)) + { /* |x| < 2^(-28). */ + if (unlikely (ia12 < 0x040)) + { /* |x| < 2^(-119). */ + float y = fmaf (TwoOverSqrtPiMinusOne, x, x); + return check_uflowf (y); + } + return x + TwoOverSqrtPiMinusOne * x; + } + + x2 = x * x; + + /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2). */ + r = A[5]; + r = fmaf (r, x2, A[4]); + r = fmaf (r, x2, A[3]); + r = fmaf (r, x2, A[2]); + r = fmaf (r, x2, A[1]); + r = fmaf (r, x2, A[0]); + r = fmaf (r, x, x); + } + else if (ia12 < 0x408) + { /* |x| < 4.0 - Use a custom Estrin scheme. */ + + float a = fabsf (x); + /* Start with Estrin scheme on high order (small magnitude) coefficients. */ + r = fmaf (B[6], a, B[5]); + u = fmaf (B[4], a, B[3]); + x2 = x * x; + r = fmaf (r, x2, u); + /* Then switch to pure Horner scheme. */ + r = fmaf (r, a, B[2]); + r = fmaf (r, a, B[1]); + r = fmaf (r, a, B[0]); + r = fmaf (r, a, a); + /* Single precision exponential with ~0.5ulps, + ensures erff has max. rel. error + < 1ulp on [0.921875, 4.0], + < 1.1ulps on [0.875, 4.0]. */ + r = expf (-r); + /* Explicit copysign (calling copysignf increases latency). */ + if (sign) + r = -1.0f + r; + else + r = 1.0f - r; + } + else + { /* |x| >= 4.0. */ + + /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. 
*/ + if (unlikely (ia12 >= 0x7f8)) + return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x; + + /* Explicit copysign (calling copysignf increases latency). */ + if (sign) + r = -1.0f; + else + r = 1.0f; + } + return r; +} diff --git a/math/erff_data.c b/math/erff_data.c new file mode 100644 index 0000000..fa6b1ef --- /dev/null +++ b/math/erff_data.c @@ -0,0 +1,22 @@ +/* + * Data for approximation of erff. + * + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "math_config.h" + +/* Minimax approximation of erff. */ +const struct erff_data __erff_data = { +.erff_poly_A = { +0x1.06eba6p-03f, -0x1.8126e0p-02f, 0x1.ce1a46p-04f, +-0x1.b68bd2p-06f, 0x1.473f48p-08f, -0x1.3a1a82p-11f +}, +.erff_poly_B = { +0x1.079d0cp-3f, 0x1.450aa0p-1f, 0x1.b55cb0p-4f, +-0x1.8d6300p-6f, 0x1.fd1336p-9f, -0x1.91d2ccp-12f, +0x1.222900p-16f +} +}; + diff --git a/math/exp.c b/math/exp.c index 1909b8e..7f5024c 100644 --- a/math/exp.c +++ b/math/exp.c @@ -1,7 +1,7 @@ /* * Double-precision e^x function. * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/exp2.c b/math/exp2.c index 47aa479..35ab39f 100644 --- a/math/exp2.c +++ b/math/exp2.c @@ -1,7 +1,7 @@ /* * Double-precision 2^x function. * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/expf.c b/math/expf.c index 0fe1f7d..9b2f0c3 100644 --- a/math/expf.c +++ b/math/expf.c @@ -1,7 +1,7 @@ /* * Single-precision e^x function. * - * Copyright (c) 2017-2018, Arm Limited. + * Copyright (c) 2017-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/include/mathlib.h b/math/include/mathlib.h index 4493008..279d829 100644 --- a/math/include/mathlib.h +++ b/math/include/mathlib.h @@ -1,7 +1,7 @@ /* * Public API. * - * Copyright (c) 2015-2019, Arm Limited. + * Copyright (c) 2015-2020, Arm Limited. 
* SPDX-License-Identifier: MIT */ diff --git a/math/log.c b/math/log.c index b85d3ff..d3b7bc6 100644 --- a/math/log.c +++ b/math/log.c @@ -1,7 +1,7 @@ /* * Double-precision log(x) function. * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/log2.c b/math/log2.c index 804fb85..55102b7 100644 --- a/math/log2.c +++ b/math/log2.c @@ -1,7 +1,7 @@ /* * Double-precision log2(x) function. * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/logf.c b/math/logf.c index ee3120a..cfbaee1 100644 --- a/math/logf.c +++ b/math/logf.c @@ -1,7 +1,7 @@ /* * Single-precision log function. * - * Copyright (c) 2017-2018, Arm Limited. + * Copyright (c) 2017-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/logf_data.c b/math/logf_data.c index 53c5f62..e8973ce 100644 --- a/math/logf_data.c +++ b/math/logf_data.c @@ -1,7 +1,7 @@ /* * Data definition for logf. * - * Copyright (c) 2017-2018, Arm Limited. + * Copyright (c) 2017-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/math_config.h b/math/math_config.h index 85fc584..e851043 100644 --- a/math/math_config.h +++ b/math/math_config.h @@ -1,7 +1,7 @@ /* * Configuration for math routines. * - * Copyright (c) 2017-2018, Arm Limited. + * Copyright (c) 2017-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -298,6 +298,24 @@ check_uflow (double x) return WANT_ERRNO ? __math_check_uflow (x) : x; } +/* Check if the result overflowed to infinity. */ +HIDDEN float __math_check_oflowf (float); +/* Check if the result underflowed to 0. */ +HIDDEN float __math_check_uflowf (float); + +/* Check if the result overflowed to infinity. */ +static inline float +check_oflowf (float x) +{ + return WANT_ERRNO ? __math_check_oflowf (x) : x; +} + +/* Check if the result underflowed to 0. 
*/ +static inline float +check_uflowf (float x) +{ + return WANT_ERRNO ? __math_check_uflowf (x) : x; +} /* Shared between expf, exp2f and powf. */ #define EXP2F_TABLE_BITS 5 @@ -416,4 +434,29 @@ extern const struct pow_log_data struct {double invc, pad, logc, logctail;} tab[1 << POW_LOG_TABLE_BITS]; } __pow_log_data HIDDEN; +extern const struct erff_data +{ + float erff_poly_A[6]; + float erff_poly_B[7]; +} __erff_data HIDDEN; + +#define ERF_POLY_A_ORDER 19 +#define ERF_POLY_A_NCOEFFS 10 +#define ERFC_POLY_C_NCOEFFS 16 +#define ERFC_POLY_D_NCOEFFS 18 +#define ERFC_POLY_E_NCOEFFS 14 +#define ERFC_POLY_F_NCOEFFS 17 +extern const struct erf_data +{ + double erf_poly_A[ERF_POLY_A_NCOEFFS]; + double erf_ratio_N_A[5]; + double erf_ratio_D_A[5]; + double erf_ratio_N_B[7]; + double erf_ratio_D_B[6]; + double erfc_poly_C[ERFC_POLY_C_NCOEFFS]; + double erfc_poly_D[ERFC_POLY_D_NCOEFFS]; + double erfc_poly_E[ERFC_POLY_E_NCOEFFS]; + double erfc_poly_F[ERFC_POLY_F_NCOEFFS]; +} __erf_data HIDDEN; + #endif diff --git a/math/math_errf.c b/math/math_errf.c index 07154c5..d5350b8 100644 --- a/math/math_errf.c +++ b/math/math_errf.c @@ -1,7 +1,7 @@ /* * Single-precision math error handling. * - * Copyright (c) 2017-2018, Arm Limited. + * Copyright (c) 2017-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -64,3 +64,17 @@ __math_invalidf (float x) float y = (x - x) / (x - x); return isnan (x) ? y : with_errnof (y, EDOM); } + +/* Check result and set errno if necessary. */ + +HIDDEN float +__math_check_uflowf (float y) +{ + return y == 0.0f ? with_errnof (y, ERANGE) : y; +} + +HIDDEN float +__math_check_oflowf (float y) +{ + return isinf (y) ? with_errnof (y, ERANGE) : y; +} diff --git a/math/pow.c b/math/pow.c index ced7c4f..86842c6 100644 --- a/math/pow.c +++ b/math/pow.c @@ -1,7 +1,7 @@ /* * Double-precision x^y function. * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2020, Arm Limited. 
* SPDX-License-Identifier: MIT */ diff --git a/math/powf.c b/math/powf.c index 1534a09..6ba45d3 100644 --- a/math/powf.c +++ b/math/powf.c @@ -1,7 +1,7 @@ /* * Single-precision pow function. * - * Copyright (c) 2017-2018, Arm Limited. + * Copyright (c) 2017-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/powf_log2_data.c b/math/powf_log2_data.c index b9fbdc4..97e0d98 100644 --- a/math/powf_log2_data.c +++ b/math/powf_log2_data.c @@ -1,7 +1,7 @@ /* * Data definition for powf. * - * Copyright (c) 2017-2018, Arm Limited. + * Copyright (c) 2017-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/sincosf.c b/math/sincosf.c index e6cd41e..9746f1c 100644 --- a/math/sincosf.c +++ b/math/sincosf.c @@ -1,7 +1,7 @@ /* * Single-precision sin/cos function. * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/sincosf_data.c b/math/sincosf_data.c index 5d0b58e..ab4ac47 100644 --- a/math/sincosf_data.c +++ b/math/sincosf_data.c @@ -1,7 +1,7 @@ /* * Data definition for sinf, cosf and sincosf. * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/sinf.c b/math/sinf.c index 770b294..ddbc1da 100644 --- a/math/sinf.c +++ b/math/sinf.c @@ -1,7 +1,7 @@ /* * Single-precision sin function. * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/test/mathbench.c b/math/test/mathbench.c index 33ceda3..0c17826 100644 --- a/math/test/mathbench.c +++ b/math/test/mathbench.c @@ -1,7 +1,7 @@ /* * Microbenchmark for math functions. * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2020, Arm Limited. 
* SPDX-License-Identifier: MIT */ @@ -248,6 +248,7 @@ D (log2, 0.999, 1.001) {"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, D (xpow, 0.01, 11.1) D (ypow, -9.9, 9.9) +D (erf, -6.0, 6.0) F (dummyf, 1.0, 2.0) F (expf, -9.9, 9.9) @@ -275,6 +276,7 @@ F (cosf, -3.1, 3.1) F (cosf, 3.3, 33.3) F (cosf, 100, 1000) F (cosf, 1e6, 1e32) +F (erff, -4.0, 4.0) #if WANT_VMATH D (__s_sin, -3.1, 3.1) D (__s_cos, -3.1, 3.1) diff --git a/math/test/mathtest.c b/math/test/mathtest.c index 2ff8c3f..3108967 100644 --- a/math/test/mathtest.c +++ b/math/test/mathtest.c @@ -1,7 +1,7 @@ /* * mathtest.c - test rig for mathlib * - * Copyright (c) 1998-2018, Arm Limited. + * Copyright (c) 1998-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/test/rtest/dotest.c b/math/test/rtest/dotest.c index f416477..6be79e1 100644 --- a/math/test/rtest/dotest.c +++ b/math/test/rtest/dotest.c @@ -1,7 +1,7 @@ /* * dotest.c - actually generate mathlib test cases * - * Copyright (c) 1999-2018, Arm Limited. + * Copyright (c) 1999-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/test/rtest/intern.h b/math/test/rtest/intern.h index af574b0..12a9c74 100644 --- a/math/test/rtest/intern.h +++ b/math/test/rtest/intern.h @@ -1,7 +1,7 @@ /* * intern.h * - * Copyright (c) 1999-2018, Arm Limited. + * Copyright (c) 1999-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/test/rtest/main.c b/math/test/rtest/main.c index e94e455..0d8ead8 100644 --- a/math/test/rtest/main.c +++ b/math/test/rtest/main.c @@ -1,7 +1,7 @@ /* * main.c * - * Copyright (c) 1999-2018, Arm Limited. + * Copyright (c) 1999-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/test/rtest/random.c b/math/test/rtest/random.c index e97a8c6..5612396 100644 --- a/math/test/rtest/random.c +++ b/math/test/rtest/random.c @@ -1,7 +1,7 @@ /* * random.c - random number generator for producing mathlib test cases * - * Copyright (c) 1998-2018, Arm Limited. 
+ * Copyright (c) 1998-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/test/rtest/random.h b/math/test/rtest/random.h index c1ce956..b4b22df 100644 --- a/math/test/rtest/random.h +++ b/math/test/rtest/random.h @@ -1,7 +1,7 @@ /* * random.h - header for random.c * - * Copyright (c) 2009-2018, Arm Limited. + * Copyright (c) 2009-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/test/rtest/semi.c b/math/test/rtest/semi.c index 938dc3a..c9f0daf 100644 --- a/math/test/rtest/semi.c +++ b/math/test/rtest/semi.c @@ -1,7 +1,7 @@ /* * semi.c: test implementations of mathlib seminumerical functions * - * Copyright (c) 1999-2018, Arm Limited. + * Copyright (c) 1999-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/test/rtest/semi.h b/math/test/rtest/semi.h index da473a2..17dc415 100644 --- a/math/test/rtest/semi.h +++ b/math/test/rtest/semi.h @@ -1,7 +1,7 @@ /* * semi.h: header for semi.c * - * Copyright (c) 1999-2018, Arm Limited. + * Copyright (c) 1999-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/test/rtest/types.h b/math/test/rtest/types.h index 1a76c2e..53cd557 100644 --- a/math/test/rtest/types.h +++ b/math/test/rtest/types.h @@ -1,7 +1,7 @@ /* * types.h * - * Copyright (c) 2005-2018, Arm Limited. + * Copyright (c) 2005-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/test/rtest/wrappers.c b/math/test/rtest/wrappers.c index acaf671..de45ac5 100644 --- a/math/test/rtest/wrappers.c +++ b/math/test/rtest/wrappers.c @@ -1,7 +1,7 @@ /* * wrappers.c - wrappers to modify output of MPFR/MPC test functions * - * Copyright (c) 2014-2018, Arm Limited. + * Copyright (c) 2014-2019, Arm Limited. 
* SPDX-License-Identifier: MIT */ diff --git a/math/test/rtest/wrappers.h b/math/test/rtest/wrappers.h index 5804935..7b09c85 100644 --- a/math/test/rtest/wrappers.h +++ b/math/test/rtest/wrappers.h @@ -1,7 +1,7 @@ /* * wrappers.h - wrappers to modify output of MPFR/MPC test functions * - * Copyright (c) 2014-2018, Arm Limited. + * Copyright (c) 2014-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/math/test/runulp.sh b/math/test/runulp.sh old mode 100644 new mode 100755 index a8c391b..0190d9a --- a/math/test/runulp.sh +++ b/math/test/runulp.sh @@ -2,7 +2,7 @@ # ULP error check script. # -# Copyright (c) 2019, Arm Limited. +# Copyright (c) 2019-2020, Arm Limited. # SPDX-License-Identifier: MIT #set -x @@ -72,6 +72,16 @@ t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000 t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000 t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000 +L=1.0 +Ldir=0.9 +t erf 0 0xffff000000000000 10000 +t erf 0x1p-1022 0x1p-26 40000 +t erf -0x1p-1022 -0x1p-26 40000 +t erf 0x1p-26 0x1p3 40000 +t erf -0x1p-26 -0x1p3 40000 +t erf 0 inf 40000 +Ldir=0.5 + L=0.01 t expf 0 0xffff0000 10000 t expf 0x1p-14 0x1p8 50000 @@ -119,6 +129,17 @@ t powf 0x1p-70 0x1p70 x 0x1p-1 0x1p1 50000 t powf 0x1p-70 0x1p70 x -0x1p-1 -0x1p1 50000 t powf 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p14 50000 t powf 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000 + +L=0.6 +Ldir=0.9 +t erff 0 0xffff0000 10000 +t erff 0x1p-127 0x1p-26 40000 +t erff -0x1p-127 -0x1p-26 40000 +t erff 0x1p-26 0x1p3 40000 +t erff -0x1p-26 -0x1p3 40000 +t erff 0 inf 40000 +Ldir=0.5 + done # vector functions diff --git a/math/test/testcases/directed/cosf.tst b/math/test/testcases/directed/cosf.tst index 5dc0994..7916044 100644 --- a/math/test/testcases/directed/cosf.tst +++ b/math/test/testcases/directed/cosf.tst @@ -1,6 +1,6 @@ ; cosf.tst - Directed test cases for SP cosine ; -; Copyright (c) 2007-2018, Arm Limited. +; Copyright (c) 2007-2019, Arm Limited. 
; SPDX-License-Identifier: MIT func=cosf op1=7fc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/erf.tst b/math/test/testcases/directed/erf.tst new file mode 100644 index 0000000..7fa4d18 --- /dev/null +++ b/math/test/testcases/directed/erf.tst @@ -0,0 +1,17 @@ +; erf.tst - Directed test cases for erf +; +; Copyright (c) 2007-2020, Arm Limited. +; SPDX-License-Identifier: MIT + +func=erf op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=erf op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=erf op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=erf op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=erf op1=7ff00000.00000000 result=3ff00000.00000000 errno=0 +func=erf op1=fff00000.00000000 result=bff00000.00000000 errno=0 +func=erf op1=00000000.00000000 result=00000000.00000000 errno=ERANGE +func=erf op1=80000000.00000000 result=80000000.00000000 errno=ERANGE +func=erf op1=00000000.00000001 result=00000000.00000001 errno=0 status=ux +func=erf op1=80000000.00000001 result=80000000.00000001 errno=0 status=ux +func=erf op1=3ff00000.00000000 result=3feaf767.a741088a.c6d errno=0 +func=erf op1=bff00000.00000000 result=bfeaf767.a741088a.c6d errno=0 diff --git a/math/test/testcases/directed/erff.tst b/math/test/testcases/directed/erff.tst new file mode 100644 index 0000000..d05b7b1 --- /dev/null +++ b/math/test/testcases/directed/erff.tst @@ -0,0 +1,17 @@ +; erff.tst +; +; Copyright (c) 2007-2020, Arm Limited. 
+; SPDX-License-Identifier: MIT + +func=erff op1=7fc00001 result=7fc00001 errno=0 +func=erff op1=ffc00001 result=7fc00001 errno=0 +func=erff op1=7f800001 result=7fc00001 errno=0 status=i +func=erff op1=ff800001 result=7fc00001 errno=0 status=i +func=erff op1=7f800000 result=3f800000 errno=0 +func=erff op1=ff800000 result=bf800000 errno=0 +func=erff op1=00000000 result=00000000 errno=ERANGE +func=erff op1=80000000 result=80000000 errno=ERANGE +func=erff op1=00000001 result=00000001 errno=0 status=ux +func=erff op1=80000001 result=80000001 errno=0 status=ux +func=erff op1=3f800000 result=3f57bb3d.3a0 errno=0 +func=erff op1=bf800000 result=bf57bb3d.3a0 errno=0 diff --git a/math/test/testcases/directed/exp.tst b/math/test/testcases/directed/exp.tst index addfc0a..85d556c 100644 --- a/math/test/testcases/directed/exp.tst +++ b/math/test/testcases/directed/exp.tst @@ -1,6 +1,6 @@ ; Directed test cases for exp ; -; Copyright (c) 2018, Arm Limited. +; Copyright (c) 2018-2019, Arm Limited. ; SPDX-License-Identifier: MIT func=exp op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/exp2.tst b/math/test/testcases/directed/exp2.tst index 04a5a50..fa56c9f 100644 --- a/math/test/testcases/directed/exp2.tst +++ b/math/test/testcases/directed/exp2.tst @@ -1,6 +1,6 @@ ; Directed test cases for exp2 ; -; Copyright (c) 2018, Arm Limited. +; Copyright (c) 2018-2019, Arm Limited. ; SPDX-License-Identifier: MIT func=exp2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/exp2f.tst b/math/test/testcases/directed/exp2f.tst index 2b6a9b5..38cfc3f 100644 --- a/math/test/testcases/directed/exp2f.tst +++ b/math/test/testcases/directed/exp2f.tst @@ -1,6 +1,6 @@ ; exp2f.tst - Directed test cases for exp2f ; -; Copyright (c) 2017-2018, Arm Limited. +; Copyright (c) 2017-2019, Arm Limited. 
; SPDX-License-Identifier: MIT func=exp2f op1=7fc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/expf.tst b/math/test/testcases/directed/expf.tst index 74664c7..ff0f671 100644 --- a/math/test/testcases/directed/expf.tst +++ b/math/test/testcases/directed/expf.tst @@ -1,6 +1,6 @@ ; expf.tst - Directed test cases for expf ; -; Copyright (c) 2007-2018, Arm Limited. +; Copyright (c) 2007-2019, Arm Limited. ; SPDX-License-Identifier: MIT func=expf op1=7fc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/log.tst b/math/test/testcases/directed/log.tst index eeb762c..a0aa398 100644 --- a/math/test/testcases/directed/log.tst +++ b/math/test/testcases/directed/log.tst @@ -1,6 +1,6 @@ ; Directed test cases for log ; -; Copyright (c) 2018, Arm Limited. +; Copyright (c) 2018-2019, Arm Limited. ; SPDX-License-Identifier: MIT func=log op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/log2.tst b/math/test/testcases/directed/log2.tst index e0765d8..ff1286c 100644 --- a/math/test/testcases/directed/log2.tst +++ b/math/test/testcases/directed/log2.tst @@ -1,6 +1,6 @@ ; Directed test cases for log2 ; -; Copyright (c) 2018, Arm Limited. +; Copyright (c) 2018-2019, Arm Limited. ; SPDX-License-Identifier: MIT func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/log2f.tst b/math/test/testcases/directed/log2f.tst index 8d685ba..5832c4f 100644 --- a/math/test/testcases/directed/log2f.tst +++ b/math/test/testcases/directed/log2f.tst @@ -1,6 +1,6 @@ ; log2f.tst - Directed test cases for log2f ; -; Copyright (c) 2017-2018, Arm Limited. +; Copyright (c) 2017-2019, Arm Limited. 
; SPDX-License-Identifier: MIT func=log2f op1=7fc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/logf.tst b/math/test/testcases/directed/logf.tst index 7ccc873..6e68a36 100644 --- a/math/test/testcases/directed/logf.tst +++ b/math/test/testcases/directed/logf.tst @@ -1,6 +1,6 @@ ; logf.tst - Directed test cases for logf ; -; Copyright (c) 2007-2018, Arm Limited. +; Copyright (c) 2007-2019, Arm Limited. ; SPDX-License-Identifier: MIT func=logf op1=7fc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/pow.tst b/math/test/testcases/directed/pow.tst index a4c42be..1966581 100644 --- a/math/test/testcases/directed/pow.tst +++ b/math/test/testcases/directed/pow.tst @@ -1,6 +1,6 @@ ; Directed test cases for pow ; -; Copyright (c) 2018, Arm Limited. +; Copyright (c) 2018-2019, Arm Limited. ; SPDX-License-Identifier: MIT func=pow op1=00000000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 diff --git a/math/test/testcases/directed/powf.tst b/math/test/testcases/directed/powf.tst index efd1dd5..3fa8b11 100644 --- a/math/test/testcases/directed/powf.tst +++ b/math/test/testcases/directed/powf.tst @@ -1,6 +1,6 @@ ; powf.tst - Directed test cases for powf ; -; Copyright (c) 2007-2018, Arm Limited. +; Copyright (c) 2007-2019, Arm Limited. ; SPDX-License-Identifier: MIT func=powf op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i diff --git a/math/test/testcases/directed/sincosf.tst b/math/test/testcases/directed/sincosf.tst index b4b2526..4b33d22 100644 --- a/math/test/testcases/directed/sincosf.tst +++ b/math/test/testcases/directed/sincosf.tst @@ -1,6 +1,6 @@ ; Directed test cases for SP sincos ; -; Copyright (c) 2007-2018, Arm Limited. +; Copyright (c) 2007-2019, Arm Limited. 
; SPDX-License-Identifier: MIT diff --git a/math/test/testcases/directed/sinf.tst b/math/test/testcases/directed/sinf.tst index 13cfdca..ded80b1 100644 --- a/math/test/testcases/directed/sinf.tst +++ b/math/test/testcases/directed/sinf.tst @@ -1,6 +1,6 @@ ; sinf.tst - Directed test cases for SP sine ; -; Copyright (c) 2007-2018, Arm Limited. +; Copyright (c) 2007-2019, Arm Limited. ; SPDX-License-Identifier: MIT diff --git a/math/test/testcases/random/double.tst b/math/test/testcases/random/double.tst index c37e837..c24ff80 100644 --- a/math/test/testcases/random/double.tst +++ b/math/test/testcases/random/double.tst @@ -1,6 +1,6 @@ !! double.tst - Random test case specification for DP functions !! -!! Copyright (c) 1999-2018, Arm Limited. +!! Copyright (c) 1999-2019, Arm Limited. !! SPDX-License-Identifier: MIT test exp 10000 diff --git a/math/test/testcases/random/float.tst b/math/test/testcases/random/float.tst index baf62b9..d02a227 100644 --- a/math/test/testcases/random/float.tst +++ b/math/test/testcases/random/float.tst @@ -1,6 +1,6 @@ !! single.tst - Random test case specification for SP functions !! -!! Copyright (c) 1999-2018, Arm Limited. +!! Copyright (c) 1999-2019, Arm Limited. !! SPDX-License-Identifier: MIT test sinf 10000 diff --git a/math/test/ulp.c b/math/test/ulp.c index 371567a..51479b8 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -1,7 +1,7 @@ /* * ULP error checking tool for math functions. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. 
* SPDX-License-Identifier: MIT */ @@ -331,11 +331,13 @@ static const struct fun fun[] = { F1 (log) F1 (log2) F2 (pow) + F1 (erf) D1 (exp) D1 (exp2) D1 (log) D1 (log2) D2 (pow) + D1 (erf) #if WANT_VMATH F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0) F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0) diff --git a/math/tools/plot.py b/math/tools/plot.py old mode 100644 new mode 100755 diff --git a/math/tools/remez.jl b/math/tools/remez.jl old mode 100644 new mode 100755 index f479fc5..2ff436f --- a/math/tools/remez.jl +++ b/math/tools/remez.jl @@ -3,7 +3,7 @@ # remez.jl - implementation of the Remez algorithm for polynomial approximation # -# Copyright (c) 2015-2018, Arm Limited. +# Copyright (c) 2015-2019, Arm Limited. # SPDX-License-Identifier: MIT import Base.\ diff --git a/math/v_math.h b/math/v_math.h index 3db22e5..f2cc467 100644 --- a/math/v_math.h +++ b/math/v_math.h @@ -1,7 +1,7 @@ /* * Vector math abstractions. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/networking/test/chksum.c b/networking/test/chksum.c index 50722a4..41b9812 100644 --- a/networking/test/chksum.c +++ b/networking/test/chksum.c @@ -1,7 +1,7 @@ /* * Ones' complement checksum test & benchmark * - * Copyright 2016-2020 ARM Limited + * Copyright (c) 2016-2020, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/string/Dir.mk b/string/Dir.mk index ae7c673..cf3453f 100644 --- a/string/Dir.mk +++ b/string/Dir.mk @@ -1,6 +1,6 @@ # Makefile fragment - requires GNU make # -# Copyright (c) 2019-2020, Arm Limited. +# Copyright (c) 2019-2021, Arm Limited. 
# SPDX-License-Identifier: MIT S := $(srcdir)/string @@ -29,6 +29,8 @@ string-tests := \ build/bin/test/memchr \ build/bin/test/memrchr \ build/bin/test/memcmp \ + build/bin/test/__mtag_tag_region \ + build/bin/test/__mtag_tag_zero_region \ build/bin/test/strcpy \ build/bin/test/stpcpy \ build/bin/test/strcmp \ @@ -39,7 +41,9 @@ string-tests := \ build/bin/test/strnlen \ build/bin/test/strncmp -string-benches := build/bin/bench/memcpy +string-benches := \ + build/bin/bench/memcpy \ + build/bin/bench/strlen string-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-lib-srcs))) string-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-test-srcs))) @@ -95,6 +99,7 @@ check-string: $(string-tests-out) ! grep FAIL $^ bench-string: $(string-benches) + $(EMULATOR) build/bin/bench/strlen $(EMULATOR) build/bin/bench/memcpy install-string: \ diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S new file mode 100644 index 0000000..84339f7 --- /dev/null +++ b/string/aarch64/__mtag_tag_region.S @@ -0,0 +1,100 @@ +/* + * __mtag_tag_region - tag memory + * + * Copyright (c) 2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, MTE, LP64 ABI. + * + * Interface contract: + * Address is 16 byte aligned and size is multiple of 16. + * Returns the passed pointer. + * The memory region may remain untagged if tagging is not enabled. + */ + +#include "../asmdefs.h" + +#if __ARM_FEATURE_MEMORY_TAGGING + +#define dstin x0 +#define count x1 +#define dst x2 +#define dstend x3 +#define tmp x4 +#define zva_val x4 + +ENTRY (__mtag_tag_region) + PTR_ARG (0) + SIZE_ARG (1) + + add dstend, dstin, count + + cmp count, 96 + b.hi L(set_long) + + tbnz count, 6, L(set96) + + /* Set 0, 16, 32, or 48 bytes. */ + lsr tmp, count, 5 + add tmp, dstin, tmp, lsl 4 + cbz count, L(end) + stg dstin, [dstin] + stg dstin, [tmp] + stg dstin, [dstend, -16] +L(end): + ret + + .p2align 4 + /* Set 64..96 bytes. 
Write 64 bytes from the start and + 32 bytes from the end. */ +L(set96): + st2g dstin, [dstin] + st2g dstin, [dstin, 32] + st2g dstin, [dstend, -32] + ret + + .p2align 4 + /* Size is > 96 bytes. */ +L(set_long): + cmp count, 160 + b.lo L(no_zva) + +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ + b.ne L(no_zva) +#endif + st2g dstin, [dstin] + st2g dstin, [dstin, 32] + bic dst, dstin, 63 + sub count, dstend, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + .p2align 4 +L(zva_loop): + add dst, dst, 64 + dc gva, dst + subs count, count, 64 + b.hi L(zva_loop) + st2g dstin, [dstend, -64] + st2g dstin, [dstend, -32] + ret + +L(no_zva): + sub dst, dstin, 32 /* Dst is biased by -32. */ + sub count, count, 64 /* Adjust count for loop. */ +L(no_zva_loop): + st2g dstin, [dst, 32] + st2g dstin, [dst, 64]! + subs count, count, 64 + b.hi L(no_zva_loop) + st2g dstin, [dstend, -64] + st2g dstin, [dstend, -32] + ret + +END (__mtag_tag_region) +#endif diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S new file mode 100644 index 0000000..f58364c --- /dev/null +++ b/string/aarch64/__mtag_tag_zero_region.S @@ -0,0 +1,100 @@ +/* + * __mtag_tag_zero_region - tag memory and fill it with zero bytes + * + * Copyright (c) 2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, MTE, LP64 ABI. + * + * Interface contract: + * Address is 16 byte aligned and size is multiple of 16. + * Returns the passed pointer. + * The memory region may remain untagged if tagging is not enabled. 
+ */ + +#include "../asmdefs.h" + +#if __ARM_FEATURE_MEMORY_TAGGING + +#define dstin x0 +#define count x1 +#define dst x2 +#define dstend x3 +#define tmp x4 +#define zva_val x4 + +ENTRY (__mtag_tag_zero_region) + PTR_ARG (0) + SIZE_ARG (1) + + add dstend, dstin, count + + cmp count, 96 + b.hi L(set_long) + + tbnz count, 6, L(set96) + + /* Set 0, 16, 32, or 48 bytes. */ + lsr tmp, count, 5 + add tmp, dstin, tmp, lsl 4 + cbz count, L(end) + stzg dstin, [dstin] + stzg dstin, [tmp] + stzg dstin, [dstend, -16] +L(end): + ret + + .p2align 4 + /* Set 64..96 bytes. Write 64 bytes from the start and + 32 bytes from the end. */ +L(set96): + stz2g dstin, [dstin] + stz2g dstin, [dstin, 32] + stz2g dstin, [dstend, -32] + ret + + .p2align 4 + /* Size is > 96 bytes. */ +L(set_long): + cmp count, 160 + b.lo L(no_zva) + +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ + b.ne L(no_zva) +#endif + stz2g dstin, [dstin] + stz2g dstin, [dstin, 32] + bic dst, dstin, 63 + sub count, dstend, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + .p2align 4 +L(zva_loop): + add dst, dst, 64 + dc gzva, dst + subs count, count, 64 + b.hi L(zva_loop) + stz2g dstin, [dstend, -64] + stz2g dstin, [dstend, -32] + ret + +L(no_zva): + sub dst, dstin, 32 /* Dst is biased by -32. */ + sub count, count, 64 /* Adjust count for loop. */ +L(no_zva_loop): + stz2g dstin, [dst, 32] + stz2g dstin, [dst, 64]! + subs count, count, 64 + b.hi L(no_zva_loop) + stz2g dstin, [dstend, -64] + stz2g dstin, [dstend, -32] + ret + +END (__mtag_tag_zero_region) +#endif diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S index 31ad050..c2e967d 100644 --- a/string/aarch64/memchr-mte.S +++ b/string/aarch64/memchr-mte.S @@ -44,6 +44,8 @@ string, counting trailing zeros identifies exactly which byte matched. 
*/ ENTRY (__memchr_aarch64_mte) + PTR_ARG (0) + SIZE_ARG (2) bic src, srcin, 15 cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S index 4a5c726..c22e659 100644 --- a/string/aarch64/memchr-sve.S +++ b/string/aarch64/memchr-sve.S @@ -1,7 +1,7 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -14,15 +14,14 @@ * SVE Available. */ - .arch armv8-a+sve - .text - -ENTRY_ALIGN(__memchr_aarch64_sve, 4) +ENTRY (__memchr_aarch64_sve) + PTR_ARG (0) + SIZE_ARG (2) dup z1.b, w1 /* duplicate c to a vector */ setffr /* initialize FFR */ mov x3, 0 /* initialize off */ - nop + .p2align 4 0: whilelo p1.b, x3, x2 /* make sure off < max */ b.none 9f diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S index dfba79f..353f0d1 100644 --- a/string/aarch64/memchr.S +++ b/string/aarch64/memchr.S @@ -1,7 +1,7 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2014-2019, Arm Limited. + * Copyright (c) 2014-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -47,6 +47,8 @@ */ ENTRY (__memchr_aarch64) + PTR_ARG (0) + SIZE_ARG (2) /* Do not dereference srcin if no bytes to compare. */ cbz cntin, L(zero_length) /* diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S index 8a0a2ea..78c5eca 100644 --- a/string/aarch64/memcmp-sve.S +++ b/string/aarch64/memcmp-sve.S @@ -1,7 +1,7 @@ /* * memcmp - compare memory * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -14,10 +14,10 @@ * SVE Available. 
*/ - .arch armv8-a+sve - .text - -ENTRY_ALIGN (__memcmp_aarch64_sve, 4) +ENTRY (__memcmp_aarch64_sve) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) mov x3, 0 /* initialize off */ 0: whilelo p0.b, x3, x2 /* while off < max */ diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S index dac9147..3b10266 100644 --- a/string/aarch64/memcmp.S +++ b/string/aarch64/memcmp.S @@ -1,6 +1,6 @@ /* memcmp - compare memory * - * Copyright (c) 2013, Arm Limited. + * Copyright (c) 2013-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -28,6 +28,9 @@ #define tmp2 x8 ENTRY (__memcmp_aarch64) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) subs limit, limit, 8 b.lo L(less8) diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S index 3004179..f97f2c3 100644 --- a/string/aarch64/memcpy-advsimd.S +++ b/string/aarch64/memcpy-advsimd.S @@ -52,6 +52,9 @@ ENTRY_ALIAS (__memmove_aarch64_simd) ENTRY (__memcpy_aarch64_simd) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) add srcend, src, count add dstend, dstin, count cmp count, 128 @@ -179,12 +182,13 @@ L(copy_long_backwards): b.ls L(copy64_from_start) L(loop64_backwards): - stp A_q, B_q, [dstend, -32] + str B_q, [dstend, -16] + str A_q, [dstend, -32] ldp A_q, B_q, [srcend, -96] - stp C_q, D_q, [dstend, -64] + str D_q, [dstend, -48] + str C_q, [dstend, -64]! 
ldp C_q, D_q, [srcend, -128] sub srcend, srcend, 64 - sub dstend, dstend, 64 subs count, count, 64 b.hi L(loop64_backwards) diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S index 157bb0d..dd254f6 100644 --- a/string/aarch64/memcpy.S +++ b/string/aarch64/memcpy.S @@ -55,6 +55,9 @@ ENTRY_ALIAS (__memmove_aarch64) ENTRY (__memcpy_aarch64) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) add srcend, src, count add dstend, dstin, count cmp count, 128 diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S index ad42b49..7b4be84 100644 --- a/string/aarch64/memrchr.S +++ b/string/aarch64/memrchr.S @@ -46,6 +46,7 @@ string, counting trailing zeros identifies exactly which byte matched. */ ENTRY (__memrchr_aarch64) + PTR_ARG (0) add end, srcin, cntin sub endm1, end, 1 bic src, endm1, 15 diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S index 27743f1..9fcd975 100644 --- a/string/aarch64/memset.S +++ b/string/aarch64/memset.S @@ -1,7 +1,7 @@ /* * memset - fill memory with a constant byte * - * Copyright (c) 2012-2020, Arm Limited. + * Copyright (c) 2012-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -22,6 +22,8 @@ #define zva_val x5 ENTRY (__memset_aarch64) + PTR_ARG (0) + SIZE_ARG (2) dup v0.16B, valw add dstend, dstin, count @@ -37,7 +39,7 @@ ENTRY (__memset_aarch64) str val, [dstin] str val, [dstend, -8] ret - nop + .p2align 4 1: tbz count, 2, 2f str valw, [dstin] str valw, [dstend, -4] diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S index 577752e..dcb0e46 100644 --- a/string/aarch64/strchr-mte.S +++ b/string/aarch64/strchr-mte.S @@ -43,6 +43,7 @@ string, counting trailing zeros identifies exactly which byte matched. 
*/ ENTRY (__strchr_aarch64_mte) + PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S index 495beda..13ba9f4 100644 --- a/string/aarch64/strchr-sve.S +++ b/string/aarch64/strchr-sve.S @@ -1,7 +1,7 @@ /* * strchr/strchrnul - find a character in a string * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -14,9 +14,6 @@ * SVE Available. */ - .arch armv8-a+sve - .text - /* To build as strchrnul, define BUILD_STRCHRNUL before compiling this file. */ #ifdef BUILD_STRCHRNUL #define FUNC __strchrnul_aarch64_sve @@ -24,7 +21,8 @@ #define FUNC __strchr_aarch64_sve #endif -ENTRY_ALIGN (FUNC, 4) +ENTRY (FUNC) + PTR_ARG (0) dup z1.b, w1 /* replicate byte across vector */ setffr /* initialize FFR */ ptrue p1.b /* all ones; loop invariant */ diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S index 8d8e3fc..1063cbf 100644 --- a/string/aarch64/strchr.S +++ b/string/aarch64/strchr.S @@ -1,7 +1,7 @@ /* * strchr - find a character in a string * - * Copyright (c) 2014-2019, Arm Limited. + * Copyright (c) 2014-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -51,6 +51,7 @@ /* Locals and temporaries. */ ENTRY (__strchr_aarch64) + PTR_ARG (0) /* Magic constant 0xc0300c03 to allow us to identify which lane matches the requested byte. Even bits are set if the character matches, odd bits if either the char is NUL or matches. */ diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S index 0dbf0dc..1b0d0a6 100644 --- a/string/aarch64/strchrnul-mte.S +++ b/string/aarch64/strchrnul-mte.S @@ -41,6 +41,7 @@ string, counting trailing zeros identifies exactly which byte matched. 
*/ ENTRY (__strchrnul_aarch64_mte) + PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S index 5140e59..428ff1a 100644 --- a/string/aarch64/strchrnul-sve.S +++ b/string/aarch64/strchrnul-sve.S @@ -1,7 +1,7 @@ /* * strchrnul - find a character or nul in a string * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2019, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S index 45be15c..a4230d9 100644 --- a/string/aarch64/strchrnul.S +++ b/string/aarch64/strchrnul.S @@ -1,7 +1,7 @@ /* * strchrnul - find a character or nul in a string * - * Copyright (c) 2014-2019, Arm Limited. + * Copyright (c) 2014-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -47,6 +47,7 @@ /* Locals and temporaries. */ ENTRY (__strchrnul_aarch64) + PTR_ARG (0) /* Magic constant 0x40100401 to allow us to identify which lane matches the termination condition. */ mov wtmp2, #0x0401 diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S index 8f2abc4..12d1a6b 100644 --- a/string/aarch64/strcmp-mte.S +++ b/string/aarch64/strcmp-mte.S @@ -51,6 +51,8 @@ ENTRY (__strcmp_aarch64_mte) + PTR_ARG (0) + PTR_ARG (1) sub off2, src2, src1 mov zeroones, REP8_01 and tmp, src1, 7 @@ -99,6 +101,8 @@ L(end): sub result, data1, data2, lsr 56 ret + .p2align 4 + L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. 
Round down the addresses and then mask off @@ -127,17 +131,18 @@ L(do_misaligned): b.ne L(do_misaligned) L(src1_aligned): - lsl shift, src2, 3 + neg shift, src2, lsl 3 bic src2, src2, 7 ldr data3, [src2], 8 #ifdef __AARCH64EB__ rev data3, data3 #endif + lsr tmp, zeroones, shift + orr data3, data3, tmp sub has_nul, data3, zeroones orr tmp, data3, REP8_7f - bic has_nul, has_nul, tmp - lsr tmp, has_nul, shift - cbnz tmp, L(tail) + bics has_nul, has_nul, tmp + b.ne L(tail) sub off1, src2, src1 @@ -156,8 +161,7 @@ L(loop_unaligned): ccmp data1, data2, 0, eq b.eq L(loop_unaligned) - neg tmp, shift - lsl tmp, has_nul, tmp + lsl tmp, has_nul, shift #ifdef __AARCH64EB__ rev tmp, tmp #endif @@ -166,6 +170,7 @@ L(loop_unaligned): cbnz syndrome, L(end) L(tail): ldr data1, [src1] + neg shift, shift lsr data2, data3, shift lsr has_nul, has_nul, shift #ifdef __AARCH64EB__ @@ -180,6 +185,5 @@ L(done): sub result, data1, data2 ret - END (__strcmp_aarch64_mte) diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S index dc5b769..e6d2da5 100644 --- a/string/aarch64/strcmp-sve.S +++ b/string/aarch64/strcmp-sve.S @@ -1,7 +1,7 @@ /* * __strcmp_aarch64_sve - compare two strings * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -14,16 +14,15 @@ * SVE Available. */ - .arch armv8-a+sve - .text - -ENTRY_ALIGN (__strcmp_aarch64_sve, 4) +ENTRY (__strcmp_aarch64_sve) + PTR_ARG (0) + PTR_ARG (1) setffr /* initialize FFR */ ptrue p1.b, all /* all ones; loop invariant */ mov x2, 0 /* initialize offset */ - nop /* Read a vector's worth of bytes, stopping on first fault. */ + .p2align 4 0: ldff1b z0.b, p1/z, [x0, x2] ldff1b z1.b, p1/z, [x1, x2] rdffrs p0.b, p1/z diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S index ee95958..7714ebf 100644 --- a/string/aarch64/strcmp.S +++ b/string/aarch64/strcmp.S @@ -37,6 +37,8 @@ /* Start of performance-critical section -- one 64B cache line. 
*/ ENTRY (__strcmp_aarch64) + PTR_ARG (0) + PTR_ARG (1) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S index 7c8629e..88c222d 100644 --- a/string/aarch64/strcpy-mte.S +++ b/string/aarch64/strcpy-mte.S @@ -55,6 +55,8 @@ string, counting trailing zeros identifies exactly which byte matched. */ ENTRY (STRCPY) + PTR_ARG (0) + PTR_ARG (1) bic src, srcin, 15 mov wtmp, 0xf00f ld1 {vdata.16b}, [src] diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S index a785d45..f515462 100644 --- a/string/aarch64/strcpy-sve.S +++ b/string/aarch64/strcpy-sve.S @@ -1,7 +1,7 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -14,9 +14,6 @@ * SVE Available. */ - .arch armv8-a+sve - .text - /* To build as stpcpy, define BUILD_STPCPY before compiling this file. */ #ifdef BUILD_STPCPY #define FUNC __stpcpy_aarch64_sve @@ -24,7 +21,9 @@ #define FUNC __strcpy_aarch64_sve #endif -ENTRY_ALIGN (FUNC, 4) +ENTRY (FUNC) + PTR_ARG (0) + PTR_ARG (1) setffr /* initialize FFR */ ptrue p2.b, all /* all ones; loop invariant */ mov x2, 0 /* initialize offset */ diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S index a6090c8..6e9ed42 100644 --- a/string/aarch64/strcpy.S +++ b/string/aarch64/strcpy.S @@ -1,7 +1,7 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2013-2019, Arm Limited. + * Copyright (c) 2013-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -80,6 +80,8 @@ #define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) ENTRY (STRCPY) + PTR_ARG (0) + PTR_ARG (1) /* For moderately short strings, the fastest way to do the copy is to calculate the length of the string in the same way as strlen, then essentially do a memcpy of the result. 
This avoids the need for diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S index 6a99340..7cf41d5 100644 --- a/string/aarch64/strlen-mte.S +++ b/string/aarch64/strlen-mte.S @@ -39,6 +39,7 @@ string, counting trailing zeros identifies exactly which byte matched. */ ENTRY (__strlen_aarch64_mte) + PTR_ARG (0) bic src, srcin, 15 mov wtmp, 0xf00f ld1 {vdata.16b}, [src] diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S index 9a9a359..2392493 100644 --- a/string/aarch64/strlen-sve.S +++ b/string/aarch64/strlen-sve.S @@ -1,7 +1,7 @@ /* * __strlen_aarch64_sve - compute the length of a string * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -14,18 +14,15 @@ * SVE Available. */ - .arch armv8-a+sve - .text - -ENTRY_ALIGN (__strlen_aarch64_sve, 4) +ENTRY (__strlen_aarch64_sve) + PTR_ARG (0) setffr /* initialize FFR */ ptrue p2.b /* all ones; loop invariant */ mov x1, 0 /* initialize length */ - nop /* Read a vector's worth of bytes, stopping on first fault. */ + .p2align 4 0: ldff1b z0.b, p2/z, [x0, x1] - nop rdffrs p0.b, p2/z b.nlast 2f diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S index 3aa444b..a1b164a 100644 --- a/string/aarch64/strlen.S +++ b/string/aarch64/strlen.S @@ -1,84 +1,88 @@ /* - * strlen - calculate the length of a string + * strlen - calculate the length of a string. * - * Copyright (c) 2013, Arm Limited. + * Copyright (c) 2020, Arm Limited. * SPDX-License-Identifier: MIT */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. + * Not MTE compatible. */ #include "../asmdefs.h" -/* To test the page crossing code path more thoroughly, compile with - -DTEST_PAGE_CROSS - this will force all calls through the slower - entry path. This option is not intended for production use. */ - -/* Arguments and results. 
*/ -#define srcin x0 -#define len x0 - -/* Locals and temporaries. */ -#define src x1 -#define data1 x2 -#define data2 x3 -#define has_nul1 x4 -#define has_nul2 x5 -#define tmp1 x4 -#define tmp2 x5 -#define tmp3 x6 -#define tmp4 x7 -#define zeroones x8 - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. A faster check - (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives - false hits for characters 129..255. */ +#define srcin x0 +#define len x0 + +#define src x1 +#define data1 x2 +#define data2 x3 +#define has_nul1 x4 +#define has_nul2 x5 +#define tmp1 x4 +#define tmp2 x5 +#define tmp3 x6 +#define tmp4 x7 +#define zeroones x8 + +#define maskv v0 +#define maskd d0 +#define dataq1 q1 +#define dataq2 q2 +#define datav1 v1 +#define datav2 v2 +#define tmp x2 +#define tmpw w2 +#define synd x3 +#define shift x4 + +/* For the first 32 bytes, NUL detection works on the principle that + (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a + byte is zero, and can be done in parallel across the entire word. */ #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 + +/* To test the page crossing code path more thoroughly, compile with + -DTEST_PAGE_CROSS - this will force all calls through the slower + entry path. This option is not intended for production use. */ #ifdef TEST_PAGE_CROSS -# define MIN_PAGE_SIZE 15 +# define MIN_PAGE_SIZE 32 #else # define MIN_PAGE_SIZE 4096 #endif - /* Since strings are short on average, we check the first 16 bytes - of the string for a NUL character. In order to do an unaligned ldp - safely we have to do a page cross check first. 
If there is a NUL - byte we calculate the length from the 2 8-byte words using - conditional select to reduce branch mispredictions (it is unlikely - __strlen_aarch64 will be repeatedly called on strings with the same length). - - If the string is longer than 16 bytes, we align src so don't need - further page cross checks, and process 32 bytes per iteration - using the fast NUL check. If we encounter non-ASCII characters, - fallback to a second loop using the full NUL check. - - If the page cross check fails, we read 16 bytes from an aligned - address, remove any characters before the string, and continue - in the main loop using aligned loads. Since strings crossing a - page in the first 16 bytes are rare (probability of - 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. - - AArch64 systems have a minimum page size of 4k. We don't bother - checking for larger page sizes - the cost of setting up the correct - page size is just not worth the extra gain from a small reduction in - the cases taking the slow path. Note that we only care about - whether the first fetch, which may be misaligned, crosses a page - boundary. */ +/* Core algorithm: + + Since strings are short on average, we check the first 32 bytes of the + string for a NUL character without aligning the string. In order to use + unaligned loads safely we must do a page cross check first. + + If there is a NUL byte we calculate the length from the 2 8-byte words + using conditional select to reduce branch mispredictions (it is unlikely + strlen will be repeatedly called on strings with the same length). + + If the string is longer than 32 bytes, align src so we don't need further + page cross checks, and process 32 bytes per iteration using a fast SIMD + loop. + + If the page cross check fails, we read 32 bytes from an aligned address, + and ignore any characters before the string. If it contains a NUL + character, return the length, if not, continue in the main loop. 
*/ ENTRY (__strlen_aarch64) + PTR_ARG (0) and tmp1, srcin, MIN_PAGE_SIZE - 1 - mov zeroones, REP8_01 - cmp tmp1, MIN_PAGE_SIZE - 16 - b.gt L(page_cross) + cmp tmp1, MIN_PAGE_SIZE - 32 + b.hi L(page_cross) + + /* Look for a NUL byte in the first 16 bytes. */ ldp data1, data2, [srcin] + mov zeroones, REP8_01 + #ifdef __AARCH64EB__ /* For big-endian, carry propagation (if the final byte in the string is 0x01) means we cannot use has_nul1/2 directly. @@ -94,114 +98,103 @@ ENTRY (__strlen_aarch64) bics has_nul1, tmp1, tmp2 bic has_nul2, tmp3, tmp4 ccmp has_nul2, 0, 0, eq - beq L(main_loop_entry) + b.eq L(bytes16_31) - /* Enter with C = has_nul1 == 0. */ + /* Find the exact offset of the first NUL byte in the first 16 bytes + from the string start. Enter with C = has_nul1 == 0. */ csel has_nul1, has_nul1, has_nul2, cc mov len, 8 rev has_nul1, has_nul1 - clz tmp1, has_nul1 csel len, xzr, len, cc + clz tmp1, has_nul1 add len, len, tmp1, lsr 3 ret - /* The inner loop processes 32 bytes per iteration and uses the fast - NUL check. If we encounter non-ASCII characters, use a second - loop with the accurate NUL check. */ - .p2align 4 -L(main_loop_entry): - bic src, srcin, 15 - sub src, src, 16 -L(main_loop): - ldp data1, data2, [src, 32]! -L(page_cross_entry): - sub tmp1, data1, zeroones - sub tmp3, data2, zeroones - orr tmp2, tmp1, tmp3 - tst tmp2, zeroones, lsl 7 - bne 1f - ldp data1, data2, [src, 16] + .p2align 3 + /* Look for a NUL byte at offset 16..31 in the string. */ +L(bytes16_31): + ldp data1, data2, [srcin, 16] +#ifdef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif sub tmp1, data1, zeroones - sub tmp3, data2, zeroones - orr tmp2, tmp1, tmp3 - tst tmp2, zeroones, lsl 7 - beq L(main_loop) - add src, src, 16 -1: - /* The fast check failed, so do the slower, accurate NUL check. 
*/ orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones orr tmp4, data2, REP8_7f bics has_nul1, tmp1, tmp2 bic has_nul2, tmp3, tmp4 ccmp has_nul2, 0, 0, eq - beq L(nonascii_loop) + b.eq L(loop_entry) - /* Enter with C = has_nul1 == 0. */ -L(tail): -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul1/2 directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - csel data1, data1, data2, cc - rev data1, data1 - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - bic has_nul1, tmp1, tmp2 -#else + /* Find the exact offset of the first NUL byte at offset 16..31 from + the string start. Enter with C = has_nul1 == 0. */ csel has_nul1, has_nul1, has_nul2, cc -#endif - sub len, src, srcin + mov len, 24 rev has_nul1, has_nul1 - add tmp2, len, 8 + mov tmp3, 16 clz tmp1, has_nul1 - csel len, len, tmp2, cc + csel len, tmp3, len, cc add len, len, tmp1, lsr 3 ret -L(nonascii_loop): - ldp data1, data2, [src, 16]! - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, REP8_7f - bics has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - ccmp has_nul2, 0, 0, eq - bne L(tail) - ldp data1, data2, [src, 16]! - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, REP8_7f - bics has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - ccmp has_nul2, 0, 0, eq - beq L(nonascii_loop) - b L(tail) +L(loop_entry): + bic src, srcin, 31 - /* Load 16 bytes from [srcin & ~15] and force the bytes that precede - srcin to 0x7f, so we ignore any NUL bytes before the string. - Then continue in the aligned loop. */ -L(page_cross): - bic src, srcin, 15 - ldp data1, data2, [src] - lsl tmp1, srcin, 3 - mov tmp4, -1 + .p2align 5 +L(loop): + ldp dataq1, dataq2, [src, 32]! 
+ uminp maskv.16b, datav1.16b, datav2.16b + uminp maskv.16b, maskv.16b, maskv.16b + cmeq maskv.8b, maskv.8b, 0 + fmov synd, maskd + cbz synd, L(loop) + + /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ + cmeq maskv.16b, datav1.16b, 0 + sub len, src, srcin + tst synd, 0xffffffff + b.ne 1f + cmeq maskv.16b, datav2.16b, 0 + add len, len, 16 +1: + /* Generate a bitmask and compute correct byte offset. */ #ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ + bic maskv.8h, 0xf0 #else - /* Little-endian. Early bytes are at LSB. */ - lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ + bic maskv.8h, 0x0f, lsl 8 #endif - orr tmp1, tmp1, REP8_80 - orn data1, data1, tmp1 - orn tmp2, data2, tmp1 - tst srcin, 8 - csel data1, data1, tmp4, eq - csel data2, data2, tmp2, eq - b L(page_cross_entry) + umaxp maskv.16b, maskv.16b, maskv.16b + fmov synd, maskd +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + clz tmp, synd + add len, len, tmp, lsr 2 + ret -END (__strlen_aarch64) + .p2align 4 +L(page_cross): + bic src, srcin, 31 + mov tmpw, 0x0c03 + movk tmpw, 0xc030, lsl 16 + ld1 {datav1.16b, datav2.16b}, [src] + dup maskv.4s, tmpw + cmeq datav1.16b, datav1.16b, 0 + cmeq datav2.16b, datav2.16b, 0 + and datav1.16b, datav1.16b, maskv.16b + and datav2.16b, datav2.16b, maskv.16b + addp maskv.16b, datav1.16b, datav2.16b + addp maskv.16b, maskv.16b, maskv.16b + fmov synd, maskd + lsl shift, srcin, 1 + lsr synd, synd, shift + cbz synd, L(loop) + + rbit synd, synd + clz len, synd + lsr len, len, 1 + ret + +END (__strlen_aarch64) diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S index b7e3914..c9d6fc8 100644 --- a/string/aarch64/strncmp-mte.S +++ b/string/aarch64/strncmp-mte.S @@ -1,7 +1,7 @@ /* * strncmp - compare two strings * - * Copyright (c) 2013-2020, Arm Limited. + * Copyright (c) 2013-2021, Arm Limited. 
* SPDX-License-Identifier: MIT */ @@ -53,12 +53,10 @@ #define LS_BK lsl #endif - .text - .p2align 6 - .rep 9 - nop /* Pad so that the loop below fits a cache line. */ - .endr -ENTRY_ALIGN (__strncmp_aarch64_mte, 0) +ENTRY (__strncmp_aarch64_mte) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) cbz limit, L(ret0) eor tmp1, src1, src2 mov zeroones, #REP8_01 @@ -70,7 +68,7 @@ ENTRY_ALIGN (__strncmp_aarch64_mte, 0) /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. */ - /* Start of performance-critical section -- one 64B cache line. */ + .p2align 4 L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 @@ -83,7 +81,7 @@ L(start_realigned): bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq b.eq L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ + /* End of main loop */ L(full_check): #ifndef __AARCH64EB__ @@ -167,15 +165,15 @@ L(mutual_align): neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ ldr data2, [src2], #8 mov tmp2, #~0 - and count, count, #0x3f LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ - /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ - add limit, limit, count + /* Adjust the limit and ensure it doesn't overflow. */ + adds limit, limit, count + csinv limit, limit, xzr, lo orr data1, data1, tmp2 orr data2, data2, tmp2 b L(start_realigned) - .p2align 6 + .p2align 4 /* Don't bother with dwords for up to 16 bytes. */ L(misaligned8): cmp limit, #16 diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S index fdbe7ae..234190e 100644 --- a/string/aarch64/strncmp-sve.S +++ b/string/aarch64/strncmp-sve.S @@ -1,7 +1,7 @@ /* * strncmp - compare two strings with limit * - * Copyright (c) 2018, Arm Limited. + * Copyright (c) 2018-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -14,10 +14,10 @@ * SVE Available. 
*/ - .arch armv8-a+sve - .text - -ENTRY_ALIGN (__strncmp_aarch64_sve, 4) +ENTRY (__strncmp_aarch64_sve) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) setffr /* initialize FFR */ mov x3, 0 /* initialize off */ diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S index 584c54a..738b653 100644 --- a/string/aarch64/strncmp.S +++ b/string/aarch64/strncmp.S @@ -1,7 +1,7 @@ /* * strncmp - compare two strings * - * Copyright (c) 2013, Arm Limited. + * Copyright (c) 2013-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -40,12 +40,10 @@ #define endloop x15 #define count mask - .text - .p2align 6 - .rep 6 - nop /* Pad so that the loop below fits a cache line. */ - .endr -ENTRY_ALIGN (__strncmp_aarch64, 0) +ENTRY (__strncmp_aarch64) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) cbz limit, L(ret0) eor tmp1, src1, src2 mov zeroones, #REP8_01 @@ -60,7 +58,7 @@ ENTRY_ALIGN (__strncmp_aarch64, 0) /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. */ - /* Start of performance-critical section -- one 64B cache line. */ + .p2align 4 L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 @@ -73,7 +71,7 @@ L(start_realigned): bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq b.eq L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ + /* End of main loop */ /* Not reached the limit, must have found the end or a diff. */ tbz limit_wd, #63, L(not_limit) @@ -178,7 +176,7 @@ L(mutual_align): add limit_wd, limit_wd, tmp3, lsr #3 b L(start_realigned) - .p2align 6 + .p2align 4 /* Don't bother with dwords for up to 16 bytes. 
*/ L(misaligned8): cmp limit, #16 diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S index 5ad40d3..5b9ebf7 100644 --- a/string/aarch64/strnlen-sve.S +++ b/string/aarch64/strnlen-sve.S @@ -1,7 +1,7 @@ /* * strnlen - calculate the length of a string with limit. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -14,10 +14,9 @@ * SVE Available. */ - .arch armv8-a+sve - .text - -ENTRY_ALIGN (__strnlen_aarch64_sve, 4) +ENTRY (__strnlen_aarch64_sve) + PTR_ARG (0) + SIZE_ARG (1) setffr /* initialize FFR */ mov x2, 0 /* initialize len */ b 1f @@ -66,7 +65,7 @@ ENTRY_ALIGN (__strnlen_aarch64_sve, 4) b 1b /* End of count. Return max. */ -9: mov x0, x2 +9: mov x0, x1 ret END (__strnlen_aarch64_sve) diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S index 4852edc..48d2495 100644 --- a/string/aarch64/strnlen.S +++ b/string/aarch64/strnlen.S @@ -42,6 +42,8 @@ string, counting trailing zeros identifies exactly which byte matched. */ ENTRY (__strnlen_aarch64) + PTR_ARG (0) + SIZE_ARG (1) bic src, srcin, 15 mov wtmp, 0xf00f cbz cntin, L(nomatch) diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S index 5a409b9..1e4fb1a 100644 --- a/string/aarch64/strrchr-mte.S +++ b/string/aarch64/strrchr-mte.S @@ -44,6 +44,7 @@ if the relevant byte matched the NUL end of string. */ ENTRY (__strrchr_aarch64_mte) + PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin mov wtmp, 0x3003 diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S index dbb9bfd..d36d69a 100644 --- a/string/aarch64/strrchr-sve.S +++ b/string/aarch64/strrchr-sve.S @@ -1,7 +1,7 @@ /* * strrchr - find the last of a character in a string * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -14,10 +14,8 @@ * SVE Available. 
*/ - .arch armv8-a+sve - .text - -ENTRY_ALIGN (__strrchr_aarch64_sve, 4) +ENTRY (__strrchr_aarch64_sve) + PTR_ARG (0) dup z1.b, w1 /* replicate byte across vector */ setffr /* initialize FFR */ ptrue p1.b /* all ones; loop invariant */ diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S index f3d22d4..56185ff 100644 --- a/string/aarch64/strrchr.S +++ b/string/aarch64/strrchr.S @@ -55,6 +55,7 @@ identify exactly which byte is causing the termination, and why. */ ENTRY (__strrchr_aarch64) + PTR_ARG (0) /* Magic constant 0x40100401 to allow us to identify which lane matches the requested byte. Magic constant 0x80200802 used similarly for NUL termination. */ diff --git a/string/arm/memchr.S b/string/arm/memchr.S index 565708c..3f1ac4d 100644 --- a/string/arm/memchr.S +++ b/string/arm/memchr.S @@ -1,7 +1,7 @@ /* * memchr - scan memory for a character * - * Copyright (c) 2010, Arm Limited. + * Copyright (c) 2010-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -26,13 +26,11 @@ .arch armv7-a @ this lets us check a flag in a 00/ff byte easily in either endianness -#define __memchr_arm memchr #ifdef __ARMEB__ #define CHARTSTMASK(c) 1<<(31-(c*8)) #else #define CHARTSTMASK(c) 1<<(c*8) #endif - .text .thumb @ --------------------------------------------------------------------------- diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S index 46492b5..86e6493 100644 --- a/string/arm/memcpy.S +++ b/string/arm/memcpy.S @@ -1,7 +1,7 @@ /* * memcpy - copy memory area * - * Copyright (c) 2013, Arm Limited. + * Copyright (c) 2013-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -16,8 +16,8 @@ Unaligned accesses */ + #include "../asmdefs.h" -#define __memcpy_arm memcpy .syntax unified /* This implementation requires ARM state. */ @@ -124,7 +124,7 @@ ENTRY (__memcpy_arm) mov dst, dstin /* Preserve dstin, we need to return it. 
*/ cmp count, #64 - bge L(cpy_not_short) + bhs L(cpy_not_short) /* Deal with small copies quickly by dropping straight into the exit block. */ @@ -239,10 +239,10 @@ L(cpy_not_short): 1: subs tmp2, count, #64 /* Use tmp2 for count. */ - blt L(tail63aligned) + blo L(tail63aligned) cmp tmp2, #512 - bge L(cpy_body_long) + bhs L(cpy_body_long) L(cpy_body_medium): /* Count in tmp2. */ #ifdef USE_VFP @@ -266,7 +266,7 @@ L(cpy_body_medium): /* Count in tmp2. */ add src, src, #64 vstr d1, [dst, #56] add dst, dst, #64 - bge 1b + bhs 1b tst tmp2, #0x3f beq L(done) @@ -312,7 +312,7 @@ L(tail63aligned): /* Count in tmp2. */ ldrd A_l, A_h, [src, #64]! strd A_l, A_h, [dst, #64]! subs tmp2, tmp2, #64 - bge 1b + bhs 1b tst tmp2, #0x3f bne 1f ldr tmp2,[sp], #FRAME_SIZE @@ -383,7 +383,7 @@ L(cpy_body_long): /* Count in tmp2. */ add src, src, #32 subs tmp2, tmp2, #prefetch_lines * 64 * 2 - blt 2f + blo 2f 1: cpy_line_vfp d3, 0 cpy_line_vfp d4, 64 @@ -395,7 +395,7 @@ L(cpy_body_long): /* Count in tmp2. */ add dst, dst, #2 * 64 add src, src, #2 * 64 subs tmp2, tmp2, #prefetch_lines * 64 - bge 1b + bhs 1b 2: cpy_tail_vfp d3, 0 @@ -499,15 +499,15 @@ L(cpy_notaligned): 1: pld [src, #(3 * 64)] subs count, count, #64 - ldrmi tmp2, [sp], #FRAME_SIZE - bmi L(tail63unaligned) + ldrlo tmp2, [sp], #FRAME_SIZE + blo L(tail63unaligned) pld [src, #(4 * 64)] #ifdef USE_NEON vld1.8 {d0-d3}, [src]! vld1.8 {d4-d7}, [src]! subs count, count, #64 - bmi 2f + blo 2f 1: pld [src, #(4 * 64)] vst1.8 {d0-d3}, [ALIGN (dst, 64)]! @@ -515,7 +515,7 @@ L(cpy_notaligned): vst1.8 {d4-d7}, [ALIGN (dst, 64)]! vld1.8 {d4-d7}, [src]! subs count, count, #64 - bpl 1b + bhs 1b 2: vst1.8 {d0-d3}, [ALIGN (dst, 64)]! vst1.8 {d4-d7}, [ALIGN (dst, 64)]! diff --git a/string/arm/memset.S b/string/arm/memset.S index 3ee5238..11e9273 100644 --- a/string/arm/memset.S +++ b/string/arm/memset.S @@ -1,7 +1,7 @@ /* * memset - fill memory with a constant * - * Copyright (c) 2010, Arm Limited. + * Copyright (c) 2010-2021, Arm Limited. 
* SPDX-License-Identifier: MIT */ @@ -25,7 +25,6 @@ #else #define CHARTSTMASK(c) 1<<(c*8) #endif - .text .thumb @ --------------------------------------------------------------------------- diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S index 3e54519..b75d414 100644 --- a/string/arm/strcmp-armv6m.S +++ b/string/arm/strcmp-armv6m.S @@ -1,7 +1,7 @@ /* * strcmp for ARMv6-M (optimized for performance, not size) * - * Copyright (c) 2014-2019, Arm Limited. + * Copyright (c) 2014-2020, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S index 586c14d..51443e3 100644 --- a/string/arm/strcmp.S +++ b/string/arm/strcmp.S @@ -1,7 +1,7 @@ /* * strcmp for ARMv7 * - * Copyright (c) 2012-2019, Arm Limited. + * Copyright (c) 2012-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -11,7 +11,7 @@ available. Use ldrd to support wider loads, provided the data is sufficiently aligned. Use saturating arithmetic to optimize the compares. */ -#define __strcmp_arm strcmp + #include "../asmdefs.h" /* Build Options: @@ -125,7 +125,6 @@ #endif .endm - .text .p2align 5 L(strcmp_start_addr): #if STRCMP_NO_PRECHECK == 0 diff --git a/string/arm/strcpy.c b/string/arm/strcpy.c index 2554810..02cf94f 100644 --- a/string/arm/strcpy.c +++ b/string/arm/strcpy.c @@ -1,10 +1,11 @@ /* * strcpy * - * Copyright (c) 2008-2019, Arm Limited. + * Copyright (c) 2008-2020, Arm Limited. 
* SPDX-License-Identifier: MIT */ +#if defined (__thumb2__) && !defined (__thumb__) /* For GLIBC: #include @@ -12,7 +13,7 @@ #undef strcmp */ -#define __strcpy_arm strcpy + #ifdef __thumb2__ #define magic1(REG) "#0x01010101" #define magic2(REG) "#0x80808080" @@ -111,13 +112,8 @@ __strcpy_arm (char* dst, const char* src) # else "tst r2, #0xff\n\t" "itet ne\n\t" -# ifdef __clang__ - "strhne r2, [ip], #2\n\t" - "strbeq r2, [ip]\n\t" -# else "strneh r2, [ip], #2\n\t" "streqb r2, [ip]\n\t" -# endif "tstne r2, #0xff00\n\t" # endif "bne 5b\n\t" @@ -133,3 +129,5 @@ __strcpy_arm (char* dst, const char* src) "BX LR"); } /* For GLIBC: libc_hidden_builtin_def (strcpy) */ + +#endif /* defined (__thumb2__) && !defined (__thumb__) */ diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S index 046148a..5ad30c9 100644 --- a/string/arm/strlen-armv6t2.S +++ b/string/arm/strlen-armv6t2.S @@ -1,7 +1,7 @@ /* * strlen - calculate the length of a string * - * Copyright (c) 2010, Arm Limited. + * Copyright (c) 2010-2020, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -12,7 +12,7 @@ ARMv6T2, AArch32 */ -#define __strlen_armv6t2 strlen + #include "../asmdefs.h" #ifdef __ARMEB__ diff --git a/string/asmdefs.h b/string/asmdefs.h index 31c0f9d..340b427 100644 --- a/string/asmdefs.h +++ b/string/asmdefs.h @@ -1,7 +1,7 @@ /* * Macros for asm code. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. 
* SPDX-License-Identifier: MIT */ @@ -81,4 +81,18 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) #define L(l) .L ## l +#ifdef __ILP32__ + /* Sanitize padding bits of pointer arguments as per aapcs64 */ +#define PTR_ARG(n) mov w##n, w##n +#else +#define PTR_ARG(n) +#endif + +#ifdef __ILP32__ + /* Sanitize padding bits of size arguments as per aapcs64 */ +#define SIZE_ARG(n) mov w##n, w##n +#else +#define SIZE_ARG(n) +#endif + #endif diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c index 967507b..d5d4ea7 100644 --- a/string/bench/memcpy.c +++ b/string/bench/memcpy.c @@ -221,6 +221,40 @@ int main (void) printf ("\n"); } + printf ("\nUnaligned forwards memmove:\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s (B/ns) ", funtab[f].name); + + for (int size = 1024; size <= 32768; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + funtab[f].fun (a, a + 256 + (i & 31), size); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + } + printf ("\n"); + } + + + printf ("\nUnaligned backwards memmove:\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s (B/ns) ", funtab[f].name); + + for (int size = 1024; size <= 32768; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + funtab[f].fun (a + 256 + (i & 31), a, size); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + } + printf ("\n"); + } return 0; } diff --git a/string/bench/strlen.c b/string/bench/strlen.c new file mode 100644 index 0000000..cc0f04b --- /dev/null +++ b/string/bench/strlen.c @@ -0,0 +1,221 @@ +/* + * strlen benchmark. + * + * Copyright (c) 2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include "stringlib.h" +#include "benchlib.h" + +#define ITERS 2000 +#define ITERS2 20000000 +#define ITERS3 2000000 +#define NUM_STRLEN 16384 + +#define MAX_ALIGN 32 +#define MAX_STRLEN 256 + +static char a[(MAX_STRLEN + 1) * MAX_ALIGN] __attribute__((__aligned__(4096))); + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + size_t (*fun) (const char *s); + int test_mte; +} funtab[] = { + // clang-format off + F(strlen, 0) +#if __aarch64__ + F(__strlen_aarch64, 0) + F(__strlen_aarch64_mte, 1) +# if __ARM_FEATURE_SVE + F(__strlen_aarch64_sve, 1) +# endif +#elif __arm__ +# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 + F(__strlen_armv6t2, 0) +# endif +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +static uint16_t strlen_tests[NUM_STRLEN]; + +typedef struct { uint16_t size; uint16_t freq; } freq_data_t; +typedef struct { uint8_t align; uint16_t freq; } align_data_t; + +#define SIZE_NUM 65536 +#define SIZE_MASK (SIZE_NUM - 1) +static uint8_t strlen_len_arr[SIZE_NUM]; + +/* Frequency data for strlen sizes up to 128 based on SPEC2017. 
*/ +static freq_data_t strlen_len_freq[] = +{ + { 12,22671}, { 18,12834}, { 13, 9555}, { 6, 6348}, { 17, 6095}, { 11, 2115}, + { 10, 1335}, { 7, 814}, { 2, 646}, { 9, 483}, { 8, 471}, { 16, 418}, + { 4, 390}, { 1, 388}, { 5, 233}, { 3, 204}, { 0, 79}, { 14, 79}, + { 15, 69}, { 26, 36}, { 22, 35}, { 31, 24}, { 32, 24}, { 19, 21}, + { 25, 17}, { 28, 15}, { 21, 14}, { 33, 14}, { 20, 13}, { 24, 9}, + { 29, 9}, { 30, 9}, { 23, 7}, { 34, 7}, { 27, 6}, { 44, 5}, + { 42, 4}, { 45, 3}, { 47, 3}, { 40, 2}, { 41, 2}, { 43, 2}, + { 58, 2}, { 78, 2}, { 36, 2}, { 48, 1}, { 52, 1}, { 60, 1}, + { 64, 1}, { 56, 1}, { 76, 1}, { 68, 1}, { 80, 1}, { 84, 1}, + { 72, 1}, { 86, 1}, { 35, 1}, { 39, 1}, { 50, 1}, { 38, 1}, + { 37, 1}, { 46, 1}, { 98, 1}, {102, 1}, {128, 1}, { 51, 1}, + {107, 1}, { 0, 0} +}; + +#define ALIGN_NUM 1024 +#define ALIGN_MASK (ALIGN_NUM - 1) +static uint8_t strlen_align_arr[ALIGN_NUM]; + +/* Alignment data for strlen based on SPEC2017. */ +static align_data_t string_align_freq[] = +{ + {8, 470}, {32, 427}, {16, 99}, {1, 19}, {2, 6}, {4, 3}, {0, 0} +}; + +static void +init_strlen_distribution (void) +{ + int i, j, freq, size, n; + + for (n = i = 0; (freq = strlen_len_freq[i].freq) != 0; i++) + for (j = 0, size = strlen_len_freq[i].size; j < freq; j++) + strlen_len_arr[n++] = size; + assert (n == SIZE_NUM); + + for (n = i = 0; (freq = string_align_freq[i].freq) != 0; i++) + for (j = 0, size = string_align_freq[i].align; j < freq; j++) + strlen_align_arr[n++] = size; + assert (n == ALIGN_NUM); +} + +static void +init_strlen_tests (void) +{ + uint16_t index[MAX_ALIGN]; + + memset (a, 'x', sizeof (a)); + + /* Create indices for strings at all alignments. */ + for (int i = 0; i < MAX_ALIGN; i++) + { + index[i] = i * (MAX_STRLEN + 1); + a[index[i] + MAX_STRLEN] = 0; + } + + /* Create a random set of strlen input strings using the string length + and alignment distributions. 
*/ + for (int n = 0; n < NUM_STRLEN; n++) + { + int align = strlen_align_arr[rand32 (0) & ALIGN_MASK]; + int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK]; + + strlen_tests[n] = + index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len; + } +} + +static volatile size_t maskv = 0; + +int main (void) +{ + rand32 (0x12345678); + init_strlen_distribution (); + init_strlen_tests (); + + printf ("\nRandom strlen (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + size_t res = 0, strlen_size = 0, mask = maskv; + printf ("%22s ", funtab[f].name); + + for (int c = 0; c < NUM_STRLEN; c++) + strlen_size += funtab[f].fun (a + strlen_tests[c]); + strlen_size *= ITERS; + + /* Measure latency of strlen result with (res & mask). */ + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_STRLEN; c++) + res = funtab[f].fun (a + strlen_tests[c] + (res & mask)); + t = clock_get_ns () - t; + printf ("%.2f\n", (double)strlen_size / t); + } + + printf ("\nSmall aligned strlen (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + for (int size = 1; size <= 64; size *= 2) + { + memset (a, 'x', size); + a[size - 1] = 0; + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + funtab[f].fun (a); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); + } + printf ("\n"); + } + + printf ("\nSmall unaligned strlen (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + int align = 9; + for (int size = 1; size <= 64; size *= 2) + { + memset (a + align, 'x', size); + a[align + size - 1] = 0; + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + funtab[f].fun (a + align); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 
'B' : 'K', (double)size * ITERS2 / t); + } + printf ("\n"); + } + + printf ("\nMedium strlen (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + for (int size = 128; size <= 4096; size *= 2) + { + memset (a, 'x', size); + a[size - 1] = 0; + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + funtab[f].fun (a); + t = clock_get_ns () - t; + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + } + printf ("\n"); + } + + printf ("\n"); + + return 0; +} diff --git a/string/include/stringlib.h b/string/include/stringlib.h index 67b0dbf..378c3cd 100644 --- a/string/include/stringlib.h +++ b/string/include/stringlib.h @@ -1,7 +1,7 @@ /* * Public API. * - * Copyright (c) 2019-2020, Arm Limited. + * Copyright (c) 2019-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -54,12 +54,11 @@ size_t __strlen_aarch64_sve (const char *); size_t __strnlen_aarch64_sve (const char *, size_t); int __strncmp_aarch64_sve (const char *, const char *, size_t); # endif +# if __ARM_FEATURE_MEMORY_TAGGING +void *__mtag_tag_region (void *, size_t); +void *__mtag_tag_zero_region (void *, size_t); +# endif #elif __arm__ -#define __memcpy_arm memcpy -#define __memchr_arm memchr -#define __strcpy_arm strcpy -#define __strcmp_arm strcmp -#define __strlen_armv6t2 strlen void *__memcpy_arm (void *__restrict, const void *__restrict, size_t); void *__memset_arm (void *, int, size_t); void *__memchr_arm (const void *, int, size_t); diff --git a/string/test/__mtag_tag_region.c b/string/test/__mtag_tag_region.c new file mode 100644 index 0000000..d8c02d9 --- /dev/null +++ b/string/test/__mtag_tag_region.c @@ -0,0 +1,147 @@ +/* + * __mtag_tag_region test. + * + * Copyright (c) 2021, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST +#include +#include +#include +#include +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +static void +mtag_quoteat (const char *prefix, void *p, int len, int at) +{ + /* Print tag, untag and quote the context. */ + printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at)); + untag_buffer (p, len, 1); + p = untag_pointer (p); + quoteat (prefix, p, len, at); +} + +#define F(x) {#x, x}, + +static const struct fun +{ + const char *name; + void *(*fun) (void *s, size_t n); +} funtab[] = { +// clang-format off +#if __aarch64__ + F(__mtag_tag_region) +#endif + {0, 0} + // clang-format on +}; +#undef F + +#define A 64 +#define LEN 250000 +static unsigned char *sbuf; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + A - 1) & -A); +} + +static void +test (const struct fun *fun, int salign, int len) +{ + unsigned char *src = alignup (sbuf); + unsigned char *s = src + salign; + void *p; + int i; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || salign >= A) + abort (); + for (i = 0; i < len + 2 * A; i++) + src[i] = '?'; + for (i = 0; i < len; i++) + s[i] = 'a'; + + src = tag_buffer (src, len + 2 * A, 1); + s = src + salign; + /* Use different tag. */ + s = __arm_mte_increment_tag (s, 1); + p = fun->fun (s, len); + + if (p != s) + ERR ("%s(%p,..) 
returned %p\n", fun->name, s, p); + + for (i = 0; i < salign; i++) + { + if (src[i] != '?') + { + ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); + mtag_quoteat ("got head", src, len + 2 * A, i); + return; + } + } + + for (; i < salign + len; i++) + { + if (s[i - salign] != 'a') + { + ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); + mtag_quoteat ("got body", src, len + 2 * A, i); + return; + } + } + + for (; i < len + 2 * A; i++) + { + if (src[i] != '?') + { + ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); + mtag_quoteat ("got tail", src, len + 2 * A, i); + return; + } + } + + untag_buffer (src, len + 2 * A, 1); +} + +int +main () +{ + if (!mte_enabled ()) + return 0; + + sbuf = mte_mmap (LEN + 3 * A); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int s = 0; s < A; s += 16) + { + int n; + for (n = 0; n < 200; n += 16) + { + test (funtab + i, s, n); + } + for (; n < LEN; n *= 2) + { + test (funtab + i, s, n); + } + } + printf ("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name); + if (err_count) + r = -1; + } + return r; +} +#else +int +main () +{ + return 0; +} +#endif diff --git a/string/test/__mtag_tag_zero_region.c b/string/test/__mtag_tag_zero_region.c new file mode 100644 index 0000000..221c223 --- /dev/null +++ b/string/test/__mtag_tag_zero_region.c @@ -0,0 +1,147 @@ +/* + * __mtag_tag_zero_region test. + * + * Copyright (c) 2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST +#include +#include +#include +#include +#include "mte.h" +#include "stringlib.h" +#include "stringtest.h" + +static void +mtag_quoteat (const char *prefix, void *p, int len, int at) +{ + /* Print tag, untag and quote the context. 
*/ + printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at)); + untag_buffer (p, len, 1); + p = untag_pointer (p); + quoteat (prefix, p, len, at); +} + +#define F(x) {#x, x}, + +static const struct fun +{ + const char *name; + void *(*fun) (void *s, size_t n); +} funtab[] = { +// clang-format off +#if __aarch64__ + F(__mtag_tag_zero_region) +#endif + {0, 0} + // clang-format on +}; +#undef F + +#define A 64 +#define LEN 250000 +static unsigned char *sbuf; + +static void * +alignup (void *p) +{ + return (void *) (((uintptr_t) p + A - 1) & -A); +} + +static void +test (const struct fun *fun, int salign, int len) +{ + unsigned char *src = alignup (sbuf); + unsigned char *s = src + salign; + void *p; + int i; + + if (err_count >= ERR_LIMIT) + return; + if (len > LEN || salign >= A) + abort (); + for (i = 0; i < len + 2 * A; i++) + src[i] = '?'; + for (i = 0; i < len; i++) + s[i] = 'a' + i % 23; + + src = tag_buffer (src, len + 2 * A, 1); + s = src + salign; + /* Use different tag. */ + s = __arm_mte_increment_tag (s, 1); + p = fun->fun (s, len); + + if (p != s) + ERR ("%s(%p,..) 
returned %p\n", fun->name, s, p); + + for (i = 0; i < salign; i++) + { + if (src[i] != '?') + { + ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); + mtag_quoteat ("got head", src, len + 2 * A, i); + return; + } + } + + for (; i < salign + len; i++) + { + if (s[i - salign] != 0) + { + ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); + mtag_quoteat ("got body", src, len + 2 * A, i); + return; + } + } + + for (; i < len + 2 * A; i++) + { + if (src[i] != '?') + { + ERR ("%s(align %d, %d) failed\n", fun->name, salign, len); + mtag_quoteat ("got tail", src, len + 2 * A, i); + return; + } + } + + untag_buffer (src, len + 2 * A, 1); +} + +int +main () +{ + if (!mte_enabled ()) + return 0; + + sbuf = mte_mmap (LEN + 3 * A); + int r = 0; + for (int i = 0; funtab[i].name; i++) + { + err_count = 0; + for (int s = 0; s < A; s += 16) + { + int n; + for (n = 0; n < 200; n += 16) + { + test (funtab + i, s, n); + } + for (; n < LEN; n *= 2) + { + test (funtab + i, s, n); + } + } + printf ("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name); + if (err_count) + r = -1; + } + return r; +} +#else +int +main () +{ + return 0; +} +#endif diff --git a/string/test/memcmp.c b/string/test/memcmp.c index dd93698..7a7cf9c 100644 --- a/string/test/memcmp.c +++ b/string/test/memcmp.c @@ -1,7 +1,7 @@ /* * memcmp test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/string/test/memcpy.c b/string/test/memcpy.c index 346d920..ce0ceee 100644 --- a/string/test/memcpy.c +++ b/string/test/memcpy.c @@ -1,7 +1,7 @@ /* * memcpy test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/string/test/memmove.c b/string/test/memmove.c index af92fe3..689b68c 100644 --- a/string/test/memmove.c +++ b/string/test/memmove.c @@ -1,7 +1,7 @@ /* * memmove test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. 
* SPDX-License-Identifier: MIT */ diff --git a/string/test/memset.c b/string/test/memset.c index cebe9ad..f172144 100644 --- a/string/test/memset.c +++ b/string/test/memset.c @@ -1,7 +1,7 @@ /* * memset test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/string/test/strcmp.c b/string/test/strcmp.c index 4e718e3..d57b54e 100644 --- a/string/test/strcmp.c +++ b/string/test/strcmp.c @@ -1,7 +1,7 @@ /* * strcmp test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/string/test/strncmp.c b/string/test/strncmp.c index 23fbb0a..018a8a4 100644 --- a/string/test/strncmp.c +++ b/string/test/strncmp.c @@ -1,7 +1,7 @@ /* * strncmp test. * - * Copyright (c) 2019, Arm Limited. + * Copyright (c) 2019-2020, Arm Limited. * SPDX-License-Identifier: MIT */ diff --git a/string/test/strrchr.c b/string/test/strrchr.c index b968457..fedbdc5 100644 --- a/string/test/strrchr.c +++ b/string/test/strrchr.c @@ -1,7 +1,7 @@ /* * strrchr test. * - * Copyright (c) 2019-2020, Arm Limited. + * Copyright (c) 2019-2021, Arm Limited. * SPDX-License-Identifier: MIT */ @@ -91,7 +91,7 @@ test (const struct fun *fun, int align, int seekpos, int len) if (p != s + len) { ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n", - fun->name, s, 0, len, p, f, len); + fun->name, s, 0, len, p, s + len, len); quote ("input", s, len); } } -- Gitee