From 6288c8579ce43fdf9b48df9271e652572334e83e Mon Sep 17 00:00:00 2001
From: Caoruihong <crh.cao@huawei.com>
Date: Thu, 26 Aug 2021 12:28:36 +0800
Subject: [PATCH] feat: upgrade to v21.02

Signed-off-by: Caoruihong <crh.cao@huawei.com>
Change-Id: I4405c75294b42f93414f64dccc50d7242abe4fdd
---
 Makefile                                 |   2 +-
 README                                   |   2 +-
 README.OpenSource                        |   4 +-
 config.mk.dist                           |   2 +-
 math/cosf.c                              |   2 +-
 math/erf.c                               | 244 ++++++++++++++++++++
 math/erf_data.c                          |  85 +++++++
 math/erff.c                              | 104 +++++++++
 math/erff_data.c                         |  22 ++
 math/exp.c                               |   2 +-
 math/exp2.c                              |   2 +-
 math/expf.c                              |   2 +-
 math/include/mathlib.h                   |   2 +-
 math/log.c                               |   2 +-
 math/log2.c                              |   2 +-
 math/logf.c                              |   2 +-
 math/logf_data.c                         |   2 +-
 math/math_config.h                       |  45 +++-
 math/math_errf.c                         |  16 +-
 math/pow.c                               |   2 +-
 math/powf.c                              |   2 +-
 math/powf_log2_data.c                    |   2 +-
 math/sincosf.c                           |   2 +-
 math/sincosf_data.c                      |   2 +-
 math/sinf.c                              |   2 +-
 math/test/mathbench.c                    |   4 +-
 math/test/mathtest.c                     |   2 +-
 math/test/rtest/dotest.c                 |   2 +-
 math/test/rtest/intern.h                 |   2 +-
 math/test/rtest/main.c                   |   2 +-
 math/test/rtest/random.c                 |   2 +-
 math/test/rtest/random.h                 |   2 +-
 math/test/rtest/semi.c                   |   2 +-
 math/test/rtest/semi.h                   |   2 +-
 math/test/rtest/types.h                  |   2 +-
 math/test/rtest/wrappers.c               |   2 +-
 math/test/rtest/wrappers.h               |   2 +-
 math/test/runulp.sh                      |  23 +-
 math/test/testcases/directed/cosf.tst    |   2 +-
 math/test/testcases/directed/erf.tst     |  17 ++
 math/test/testcases/directed/erff.tst    |  17 ++
 math/test/testcases/directed/exp.tst     |   2 +-
 math/test/testcases/directed/exp2.tst    |   2 +-
 math/test/testcases/directed/exp2f.tst   |   2 +-
 math/test/testcases/directed/expf.tst    |   2 +-
 math/test/testcases/directed/log.tst     |   2 +-
 math/test/testcases/directed/log2.tst    |   2 +-
 math/test/testcases/directed/log2f.tst   |   2 +-
 math/test/testcases/directed/logf.tst    |   2 +-
 math/test/testcases/directed/pow.tst     |   2 +-
 math/test/testcases/directed/powf.tst    |   2 +-
 math/test/testcases/directed/sincosf.tst |   2 +-
 math/test/testcases/directed/sinf.tst    |   2 +-
 math/test/testcases/random/double.tst    |   2 +-
 math/test/testcases/random/float.tst     |   2 +-
 math/test/ulp.c                          |   4 +-
 math/tools/plot.py                       |   0
 math/tools/remez.jl                      |   2 +-
 math/v_math.h                            |   2 +-
 networking/test/chksum.c                 |   2 +-
 string/Dir.mk                            |   9 +-
 string/aarch64/__mtag_tag_region.S       | 100 +++++++++
 string/aarch64/__mtag_tag_zero_region.S  | 100 +++++++++
 string/aarch64/memchr-mte.S              |   2 +
 string/aarch64/memchr-sve.S              |  11 +-
 string/aarch64/memchr.S                  |   4 +-
 string/aarch64/memcmp-sve.S              |  10 +-
 string/aarch64/memcmp.S                  |   5 +-
 string/aarch64/memcpy-advsimd.S          |  10 +-
 string/aarch64/memcpy.S                  |   3 +
 string/aarch64/memrchr.S                 |   1 +
 string/aarch64/memset.S                  |   6 +-
 string/aarch64/strchr-mte.S              |   1 +
 string/aarch64/strchr-sve.S              |   8 +-
 string/aarch64/strchr.S                  |   3 +-
 string/aarch64/strchrnul-mte.S           |   1 +
 string/aarch64/strchrnul-sve.S           |   2 +-
 string/aarch64/strchrnul.S               |   3 +-
 string/aarch64/strcmp-mte.S              |  18 +-
 string/aarch64/strcmp-sve.S              |  11 +-
 string/aarch64/strcmp.S                  |   2 +
 string/aarch64/strcpy-mte.S              |   2 +
 string/aarch64/strcpy-sve.S              |   9 +-
 string/aarch64/strcpy.S                  |   4 +-
 string/aarch64/strlen-mte.S              |   1 +
 string/aarch64/strlen-sve.S              |  11 +-
 string/aarch64/strlen.S                  | 275 +++++++++++------------
 string/aarch64/strncmp-mte.S             |  24 +-
 string/aarch64/strncmp-sve.S             |  10 +-
 string/aarch64/strncmp.S                 |  18 +-
 string/aarch64/strnlen-sve.S             |  11 +-
 string/aarch64/strnlen.S                 |   2 +
 string/aarch64/strrchr-mte.S             |   1 +
 string/aarch64/strrchr-sve.S             |   8 +-
 string/aarch64/strrchr.S                 |   1 +
 string/arm/memchr.S                      |   4 +-
 string/arm/memcpy.S                      |  26 +--
 string/arm/memset.S                      |   3 +-
 string/arm/strcmp-armv6m.S               |   2 +-
 string/arm/strcmp.S                      |   5 +-
 string/arm/strcpy.c                      |  12 +-
 string/arm/strlen-armv6t2.S              |   4 +-
 string/asmdefs.h                         |  16 +-
 string/bench/memcpy.c                    |  34 +++
 string/bench/strlen.c                    | 221 ++++++++++++++++++
 string/include/stringlib.h               |  11 +-
 string/test/__mtag_tag_region.c          | 147 ++++++++++++
 string/test/__mtag_tag_zero_region.c     | 147 ++++++++++++
 string/test/memcmp.c                     |   2 +-
 string/test/memcpy.c                     |   2 +-
 string/test/memmove.c                    |   2 +-
 string/test/memset.c                     |   2 +-
 string/test/strcmp.c                     |   2 +-
 string/test/strncmp.c                    |   2 +-
 string/test/strrchr.c                    |   4 +-
 115 files changed, 1680 insertions(+), 334 deletions(-)
 create mode 100644 math/erf.c
 create mode 100644 math/erf_data.c
 create mode 100644 math/erff.c
 create mode 100644 math/erff_data.c
 mode change 100644 => 100755 math/test/runulp.sh
 create mode 100644 math/test/testcases/directed/erf.tst
 create mode 100644 math/test/testcases/directed/erff.tst
 mode change 100644 => 100755 math/tools/plot.py
 mode change 100644 => 100755 math/tools/remez.jl
 create mode 100644 string/aarch64/__mtag_tag_region.S
 create mode 100644 string/aarch64/__mtag_tag_zero_region.S
 create mode 100644 string/bench/strlen.c
 create mode 100644 string/test/__mtag_tag_region.c
 create mode 100644 string/test/__mtag_tag_zero_region.c

diff --git a/Makefile b/Makefile
index 89fc13b..169f89e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 # Makefile - requires GNU make
 #
-# Copyright (c) 2018-2019, Arm Limited.
+# Copyright (c) 2018-2020, Arm Limited.
 # SPDX-License-Identifier: MIT
 
 srcdir = .
diff --git a/README b/README
index 6cde5cd..9e1a34f 100644
--- a/README
+++ b/README
@@ -9,7 +9,7 @@ contributor-agreement.pdf. This is needed so upstreaming code
 to projects that require copyright assignment is possible.
 
 Regular quarterly releases are tagged as vYY.MM, the latest
-release is v20.05.
+release is v21.02.
 
 Source code layout:
 
diff --git a/README.OpenSource b/README.OpenSource
index 45a839b..0e874ba 100644
--- a/README.OpenSource
+++ b/README.OpenSource
@@ -3,9 +3,9 @@
         "Name"                  : "optimized-routines",
         "License"               : "MIT License",
         "License File"          : "LICENSE",
-        "Version Number"        : "v20.05",
+        "Version Number"        : "v21.02",
         "Owner"                 : "zhaotianyu9@huawei.com",
-        "Upstream URL"          : "https://www.mirbsd./mksh.ht://www.arm.com/;https://github.com/ARM-software/optimized-routines",
+        "Upstream URL"          : "https://github.com/ARM-software/optimized-routines",
         "Description"           : "Optimized implementations of various library functions for ARM architecture processors"
     }
 ]
diff --git a/config.mk.dist b/config.mk.dist
index 3e55c98..177e1ac 100644
--- a/config.mk.dist
+++ b/config.mk.dist
@@ -1,6 +1,6 @@
 # Example config.mk
 #
-# Copyright (c) 2018-2019, Arm Limited.
+# Copyright (c) 2018-2020, Arm Limited.
 # SPDX-License-Identifier: MIT
 
 # Subprojects to build
diff --git a/math/cosf.c b/math/cosf.c
index 831b39e..f29f194 100644
--- a/math/cosf.c
+++ b/math/cosf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision cos function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/erf.c b/math/erf.c
new file mode 100644
index 0000000..12d7e51
--- /dev/null
+++ b/math/erf.c
@@ -0,0 +1,244 @@
+/*
+ * Double-precision erf(x) function.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "math_config.h"
+#include <math.h>
+#include <stdint.h>
+
+#define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3
+#define C 0x1.b0ac16p-1
+#define PA __erf_data.erf_poly_A
+#define NA __erf_data.erf_ratio_N_A
+#define DA __erf_data.erf_ratio_D_A
+#define NB __erf_data.erf_ratio_N_B
+#define DB __erf_data.erf_ratio_D_B
+#define PC __erf_data.erfc_poly_C
+#define PD __erf_data.erfc_poly_D
+#define PE __erf_data.erfc_poly_E
+#define PF __erf_data.erfc_poly_F
+
+/* Top 32 bits of a double: sign, exponent and the leading mantissa bits.  */
+static inline uint32_t
+top32 (double x)
+{
+  return asuint64 (x) >> 32; /* Assumes asuint64 yields the raw IEEE bits.  */
+}
+
+/* Fast erf implementation using a mix of
+   rational and polynomial approximations, selected by the range of |x|.
+   Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0.  */
+double
+erf (double x)
+{
+  /* Get top word, its magnitude part (sign bit cleared) and the sign bit.  */
+  uint32_t ix = top32 (x);
+  uint32_t ia = ix & 0x7fffffff;
+  uint32_t sign = ix >> 31;
+
+  /* Normalized and subnormal cases.  */
+  if (ia < 0x3feb0000)
+    { /* a = |x| < 0.84375.  */
+
+      if (ia < 0x3e300000)
+	{ /* a < 2^(-28).  */
+	  if (ia < 0x00800000)
+	    { /* a < 2^(-1015), including x == +-0; result goes via check_uflow.  */
+	      double y =  fma (TwoOverSqrtPiMinusOne, x, x);
+	      return check_uflow (y);
+	    }
+	  return x + TwoOverSqrtPiMinusOne * x; /* i.e. (2/sqrt(pi)) * x.  */
+	}
+
+      double x2 = x * x;
+
+      if (ia < 0x3fe00000)
+	{ /* a < 0.5  - Use polynomial approximation.  */
+	  double r1 = fma (x2, PA[1], PA[0]);
+	  double r2 = fma (x2, PA[3], PA[2]);
+	  double r3 = fma (x2, PA[5], PA[4]);
+	  double r4 = fma (x2, PA[7], PA[6]);
+	  double r5 = fma (x2, PA[9], PA[8]);
+	  double x4 = x2 * x2;
+	  double r = r5; /* Horner scheme in x4 over the paired terms r5..r1.  */
+	  r = fma (x4, r, r4);
+	  r = fma (x4, r, r3);
+	  r = fma (x4, r, r2);
+	  r = fma (x4, r, r1);
+	  return fma (r, x, x); /* This fma is crucial for accuracy.  */
+	}
+      else
+	{ /* 0.5 <= a < 0.84375 - Use rational approximation.  */
+	  double x4, x8, r1n, r2n, r1d, r2d, r3d;
+
+	  r1n = fma (x2, NA[1], NA[0]);
+	  x4 = x2 * x2;
+	  r2n = fma (x2, NA[3], NA[2]);
+	  x8 = x4 * x4;
+	  r1d = fma (x2, DA[0], 1.0);
+	  r2d = fma (x2, DA[2], DA[1]);
+	  r3d = fma (x2, DA[4], DA[3]);
+	  double P = r1n + x4 * r2n + x8 * NA[4];
+	  double Q = r1d + x4 * r2d + x8 * r3d;
+	  return fma (P / Q, x, x); /* x + x * P(x^2) / Q(x^2).  */
+	}
+    }
+  else if (ia < 0x3ff40000)
+    { /* 0.84375 <= |x| < 1.25.  */
+      double a2, a4, a6, r1n, r2n, r3n, r4n, r1d, r2d, r3d, r4d;
+      double a = fabs (x) - 1.0;
+      r1n = fma (a, NB[1], NB[0]);
+      a2 = a * a;
+      r1d = fma (a, DB[0], 1.0);
+      a4 = a2 * a2;
+      r2n = fma (a, NB[3], NB[2]);
+      a6 = a4 * a2;
+      r2d = fma (a, DB[2], DB[1]);
+      r3n = fma (a, NB[5], NB[4]);
+      r3d = fma (a, DB[4], DB[3]);
+      r4n = NB[6];
+      r4d = DB[5];
+      double P = r1n + a2 * r2n + a4 * r3n + a6 * r4n;
+      double Q = r1d + a2 * r2d + a4 * r3d + a6 * r4d;
+      if (sign) /* Mirror the result for negative x.  */
+	return -C - P / Q;
+      else
+	return C + P / Q;
+    }
+  else if (ia < 0x40000000)
+    { /* 1.25 <= |x| < 2.0.  */
+      double a = fabs (x);
+      a = a - 1.25;
+
+      double r1 = fma (a, PC[1], PC[0]);
+      double r2 = fma (a, PC[3], PC[2]);
+      double r3 = fma (a, PC[5], PC[4]);
+      double r4 = fma (a, PC[7], PC[6]);
+      double r5 = fma (a, PC[9], PC[8]);
+      double r6 = fma (a, PC[11], PC[10]);
+      double r7 = fma (a, PC[13], PC[12]);
+      double r8 = fma (a, PC[15], PC[14]);
+
+      double a2 = a * a;
+
+      double r = r8; /* Horner scheme in a2 over the paired terms r8..r1.  */
+      r = fma (a2, r, r7);
+      r = fma (a2, r, r6);
+      r = fma (a2, r, r5);
+      r = fma (a2, r, r4);
+      r = fma (a2, r, r3);
+      r = fma (a2, r, r2);
+      r = fma (a2, r, r1);
+
+      if (sign) /* r comes from the erfc_poly_C table; result is +-(1 - r).  */
+	return -1.0 + r;
+      else
+	return 1.0 - r;
+    }
+  else if (ia < 0x400a0000)
+    { /* 2 <= |x| < 3.25.  */
+      double a = fabs (x);
+      a = fma (0.5, a, -1.0); /* a = |x| / 2 - 1.  */
+
+      double r1 = fma (a, PD[1], PD[0]);
+      double r2 = fma (a, PD[3], PD[2]);
+      double r3 = fma (a, PD[5], PD[4]);
+      double r4 = fma (a, PD[7], PD[6]);
+      double r5 = fma (a, PD[9], PD[8]);
+      double r6 = fma (a, PD[11], PD[10]);
+      double r7 = fma (a, PD[13], PD[12]);
+      double r8 = fma (a, PD[15], PD[14]);
+      double r9 = fma (a, PD[17], PD[16]);
+
+      double a2 = a * a;
+
+      double r = r9; /* Horner scheme in a2 over the paired terms r9..r1.  */
+      r = fma (a2, r, r8);
+      r = fma (a2, r, r7);
+      r = fma (a2, r, r6);
+      r = fma (a2, r, r5);
+      r = fma (a2, r, r4);
+      r = fma (a2, r, r3);
+      r = fma (a2, r, r2);
+      r = fma (a2, r, r1);
+
+      if (sign) /* r comes from the erfc_poly_D table; result is +-(1 - r).  */
+	return -1.0 + r;
+      else
+	return 1.0 - r;
+    }
+  else if (ia < 0x40100000)
+    { /* 3.25 <= |x| < 4.0.  */
+      double a = fabs (x);
+      a = a - 3.25;
+
+      double r1 = fma (a, PE[1], PE[0]);
+      double r2 = fma (a, PE[3], PE[2]);
+      double r3 = fma (a, PE[5], PE[4]);
+      double r4 = fma (a, PE[7], PE[6]);
+      double r5 = fma (a, PE[9], PE[8]);
+      double r6 = fma (a, PE[11], PE[10]);
+      double r7 = fma (a, PE[13], PE[12]);
+
+      double a2 = a * a;
+
+      double r = r7; /* Horner scheme in a2 over the paired terms r7..r1.  */
+      r = fma (a2, r, r6);
+      r = fma (a2, r, r5);
+      r = fma (a2, r, r4);
+      r = fma (a2, r, r3);
+      r = fma (a2, r, r2);
+      r = fma (a2, r, r1);
+
+      if (sign) /* r comes from the erfc_poly_E table; result is +-(1 - r).  */
+	return -1.0 + r;
+      else
+	return 1.0 - r;
+    }
+  else if (ia < 0x4017a000)
+    { /* 4 <= |x| < 5.90625.  */
+      double a = fabs (x);
+      a = fma (0.5, a, -2.0); /* a = |x| / 2 - 2.  */
+
+      double r1 = fma (a, PF[1], PF[0]);
+      double r2 = fma (a, PF[3], PF[2]);
+      double r3 = fma (a, PF[5], PF[4]);
+      double r4 = fma (a, PF[7], PF[6]);
+      double r5 = fma (a, PF[9], PF[8]);
+      double r6 = fma (a, PF[11], PF[10]);
+      double r7 = fma (a, PF[13], PF[12]);
+      double r8 = fma (a, PF[15], PF[14]);
+      double r9 = PF[16]; /* Odd coefficient count: the last term is unpaired.  */
+
+      double a2 = a * a;
+
+      double r = r9; /* Horner scheme in a2 over the terms r9..r1.  */
+      r = fma (a2, r, r8);
+      r = fma (a2, r, r7);
+      r = fma (a2, r, r6);
+      r = fma (a2, r, r5);
+      r = fma (a2, r, r4);
+      r = fma (a2, r, r3);
+      r = fma (a2, r, r2);
+      r = fma (a2, r, r1);
+
+      if (sign) /* r comes from the erfc_poly_F table; result is +-(1 - r).  */
+	return -1.0 + r;
+      else
+	return 1.0 - r;
+    }
+  else
+    {
+      /* Special cases : erf(nan)=nan, erf(+inf)=+1 and erf(-inf)=-1.  */
+      if (unlikely (ia >= 0x7ff00000))
+	return (double) (1.0 - (sign << 1)) + 1.0 / x;
+
+      if (sign) /* Finite |x| >= 5.90625: return +-1 directly.  */
+	return -1.0;
+      else
+	return 1.0;
+    }
+}
diff --git a/math/erf_data.c b/math/erf_data.c
new file mode 100644
index 0000000..807875b
--- /dev/null
+++ b/math/erf_data.c
@@ -0,0 +1,85 @@
+/*
+ * Shared data between erf and erfc.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "math_config.h"
+
+/*
+Minimax approximations of erf, one coefficient table per input range.
+*/
+const struct erf_data __erf_data = {
+.erf_poly_A = { /* erf uses this polynomial for |x| < 0.5.  */
+#if ERF_POLY_A_NCOEFFS == 10
+0x1.06eba8214db68p-3, -0x1.812746b037948p-2, 0x1.ce2f21a03872p-4,
+-0x1.b82ce30e6548p-6, 0x1.565bcc360a2f2p-8, -0x1.c02d812bc979ap-11,
+0x1.f99bddfc1ebe9p-14, -0x1.f42c457cee912p-17, 0x1.b0e414ec20ee9p-20,
+-0x1.18c47fd143c5ep-23
+#endif
+},
+/* Rational approximation on [0x1p-28, 0.84375]; erf uses it from 0.5 up.  */
+.erf_ratio_N_A = { /* Numerator.  */
+0x1.06eba8214db68p-3, -0x1.4cd7d691cb913p-2, -0x1.d2a51dbd7194fp-6,
+-0x1.7a291236668e4p-8, -0x1.8ead6120016acp-16
+},
+.erf_ratio_D_A = { /* Denominator; erf supplies the leading 1.0 itself.  */
+0x1.97779cddadc09p-2, 0x1.0a54c5536cebap-4, 0x1.4d022c4d36b0fp-8,
+0x1.15dc9221c1a1p-13, -0x1.09c4342a2612p-18
+},
+/* Rational approximation on [0.84375, 1.25], evaluated in |x| - 1 by erf.  */
+.erf_ratio_N_B = { /* Numerator.  */
+-0x1.359b8bef77538p-9, 0x1.a8d00ad92b34dp-2, -0x1.7d240fbb8c3f1p-2,
+0x1.45fca805120e4p-2, -0x1.c63983d3e28ecp-4, 0x1.22a36599795ebp-5,
+-0x1.1bf380a96073fp-9
+},
+.erf_ratio_D_B = { /* Denominator; erf supplies the leading 1.0 itself.  */
+0x1.b3e6618eee323p-4, 0x1.14af092eb6f33p-1, 0x1.2635cd99fe9a7p-4,
+0x1.02660e763351fp-3, 0x1.bedc26b51dd1cp-7, 0x1.88b545735151dp-7
+},
+.erfc_poly_C = { /* erf uses this table for 1.25 <= |x| < 2.  */
+#if ERFC_POLY_C_NCOEFFS == 16
+/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=15 a=1.25 b=2 c=1 d=1.25 */
+0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2,
+-0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5,
+-0x1.5cffd86b4de16p-6, -0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8,
+-0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12,
+0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16,
+-0x1.578c9e375d37p-19
+#endif
+},
+.erfc_poly_D = { /* erf uses this table for 2 <= |x| < 3.25.  */
+#if ERFC_POLY_D_NCOEFFS == 18
+/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2 */
+0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3,
+-0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, -0x1.ace732477e494p-2,
+-0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2,
+0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3,
+-0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3,
+0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10
+#endif
+},
+.erfc_poly_E = { /* erf uses this table for 3.25 <= |x| < 4.  */
+#if ERFC_POLY_E_NCOEFFS == 14
+/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25 */
+0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14,
+-0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12,
+0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14,
+-0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20,
+-0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23
+#endif
+},
+.erfc_poly_F = { /* erf uses this table for 4 <= |x| < 5.90625.  */
+#if ERFC_POLY_F_NCOEFFS == 17
+/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4 */
+0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19,
+-0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14,
+0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11,
+-0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11,
+0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14,
+-0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19
+#endif
+}
+};
+
diff --git a/math/erff.c b/math/erff.c
new file mode 100644
index 0000000..a58e825
--- /dev/null
+++ b/math/erff.c
@@ -0,0 +1,104 @@
+/*
+ * Single-precision erf(x) function.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <math.h>
+#include "math_config.h"
+
+#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
+#define A __erff_data.erff_poly_A
+#define B __erff_data.erff_poly_B
+
+/* Top 12 bits of a float: sign, exponent and the leading 3 mantissa bits.  */
+static inline uint32_t
+top12 (float x)
+{
+  return asuint (x) >> 20; /* Assumes asuint yields the raw IEEE bits.  */
+}
+
+/* Efficient implementation of erff
+   using either a pure polynomial approximation or
+   the exponential of a polynomial.
+   Worst-case error is 1.09ulps at 0x1.c111acp-1.  */
+float
+erff (float x)
+{
+  float r, x2, u;
+
+  /* Get the raw bits, the sign bit and the top 12 bits with the sign cleared.  */
+  uint32_t ix = asuint (x);
+  uint32_t sign = ix >> 31;
+  uint32_t ia12 = top12 (x) & 0x7ff;
+
+  /* Limit of both intervals is 0.875 for performance reasons but coefficients
+     computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy
+     from 0.94 to 1.1ulps.  */
+  if (ia12 < 0x3f6)
+    { /* a = |x| < 0.875.  */
+
+      /* Tiny and subnormal cases.  */
+      if (unlikely (ia12 < 0x318))
+	{ /* |x| < 2^(-28).  */
+	  if (unlikely (ia12 < 0x040))
+	    { /* |x| < 2^(-119), including x == +-0; result goes via check_uflowf.  */
+	      float y = fmaf (TwoOverSqrtPiMinusOne, x, x);
+	      return check_uflowf (y);
+	    }
+	  return x + TwoOverSqrtPiMinusOne * x; /* i.e. (2/sqrt(pi)) * x.  */
+	}
+
+      x2 = x * x;
+
+      /* Normalized cases (|x| < 0.875 here; coefficients fitted up to 0.921875). Use Horner scheme for x+x*P(x^2).  */
+      r = A[5];
+      r = fmaf (r, x2, A[4]);
+      r = fmaf (r, x2, A[3]);
+      r = fmaf (r, x2, A[2]);
+      r = fmaf (r, x2, A[1]);
+      r = fmaf (r, x2, A[0]);
+      r = fmaf (r, x, x);
+    }
+  else if (ia12 < 0x408)
+    { /* 0.875 <= |x| < 4.0 - Use a custom Estrin scheme.  */
+
+      float a = fabsf (x);
+      /* Start with Estrin scheme on high order (small magnitude) coefficients.  */
+      r = fmaf (B[6], a, B[5]);
+      u = fmaf (B[4], a, B[3]);
+      x2 = x * x; /* Same value as a * a since a = |x|.  */
+      r = fmaf (r, x2, u);
+      /* Then switch to pure Horner scheme.  */
+      r = fmaf (r, a, B[2]);
+      r = fmaf (r, a, B[1]);
+      r = fmaf (r, a, B[0]);
+      r = fmaf (r, a, a);
+      /* Single precision exponential with ~0.5ulps,
+	 ensures erff has max. rel. error
+	 < 1ulp on [0.921875, 4.0],
+	 < 1.1ulps on [0.875, 4.0].  */
+      r = expf (-r);
+      /* Explicit copysign (calling copysignf increases latency).  */
+      if (sign)
+	r = -1.0f + r;
+      else
+	r = 1.0f - r;
+    }
+  else
+    { /* |x| >= 4.0.  */
+
+      /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1.  */
+      if (unlikely (ia12 >= 0x7f8))
+	return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x;
+
+      /* Explicit copysign (calling copysignf increases latency).  */
+      if (sign)
+	r = -1.0f;
+      else
+	r = 1.0f;
+    }
+  return r;
+}
diff --git a/math/erff_data.c b/math/erff_data.c
new file mode 100644
index 0000000..fa6b1ef
--- /dev/null
+++ b/math/erff_data.c
@@ -0,0 +1,22 @@
+/*
+ * Data for approximation of erff.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "math_config.h"
+
+/* Minimax approximation of erff, one coefficient table per input range. */
+const struct erff_data __erff_data = {
+.erff_poly_A = { /* erff uses this table for |x| < 0.875.  */
+0x1.06eba6p-03f, -0x1.8126e0p-02f, 0x1.ce1a46p-04f,
+-0x1.b68bd2p-06f, 0x1.473f48p-08f, -0x1.3a1a82p-11f
+},
+.erff_poly_B = { /* erff uses this table for 0.875 <= |x| < 4.0.  */
+0x1.079d0cp-3f, 0x1.450aa0p-1f, 0x1.b55cb0p-4f,
+-0x1.8d6300p-6f, 0x1.fd1336p-9f, -0x1.91d2ccp-12f,
+0x1.222900p-16f
+}
+};
+
diff --git a/math/exp.c b/math/exp.c
index 1909b8e..7f5024c 100644
--- a/math/exp.c
+++ b/math/exp.c
@@ -1,7 +1,7 @@
 /*
  * Double-precision e^x function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/exp2.c b/math/exp2.c
index 47aa479..35ab39f 100644
--- a/math/exp2.c
+++ b/math/exp2.c
@@ -1,7 +1,7 @@
 /*
  * Double-precision 2^x function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/expf.c b/math/expf.c
index 0fe1f7d..9b2f0c3 100644
--- a/math/expf.c
+++ b/math/expf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision e^x function.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/include/mathlib.h b/math/include/mathlib.h
index 4493008..279d829 100644
--- a/math/include/mathlib.h
+++ b/math/include/mathlib.h
@@ -1,7 +1,7 @@
 /*
  * Public API.
  *
- * Copyright (c) 2015-2019, Arm Limited.
+ * Copyright (c) 2015-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/log.c b/math/log.c
index b85d3ff..d3b7bc6 100644
--- a/math/log.c
+++ b/math/log.c
@@ -1,7 +1,7 @@
 /*
  * Double-precision log(x) function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/log2.c b/math/log2.c
index 804fb85..55102b7 100644
--- a/math/log2.c
+++ b/math/log2.c
@@ -1,7 +1,7 @@
 /*
  * Double-precision log2(x) function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/logf.c b/math/logf.c
index ee3120a..cfbaee1 100644
--- a/math/logf.c
+++ b/math/logf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision log function.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/logf_data.c b/math/logf_data.c
index 53c5f62..e8973ce 100644
--- a/math/logf_data.c
+++ b/math/logf_data.c
@@ -1,7 +1,7 @@
 /*
  * Data definition for logf.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/math_config.h b/math/math_config.h
index 85fc584..e851043 100644
--- a/math/math_config.h
+++ b/math/math_config.h
@@ -1,7 +1,7 @@
 /*
  * Configuration for math routines.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -298,6 +298,24 @@ check_uflow (double x)
   return WANT_ERRNO ? __math_check_uflow (x) : x;
 }
 
+/* Check if the result overflowed to infinity.  */
+HIDDEN float __math_check_oflowf (float);
+/* Check if the result underflowed to 0.  */
+HIDDEN float __math_check_uflowf (float);
+
+/* Check if the result overflowed to infinity; returns x as is unless WANT_ERRNO.  */
+static inline float
+check_oflowf (float x)
+{
+  return WANT_ERRNO ? __math_check_oflowf (x) : x;
+}
+
+/* Check if the result underflowed to 0; returns x as is unless WANT_ERRNO.  */
+static inline float
+check_uflowf (float x)
+{
+  return WANT_ERRNO ? __math_check_uflowf (x) : x; /* Callee tests x == 0.  */
+}
 
 /* Shared between expf, exp2f and powf.  */
 #define EXP2F_TABLE_BITS 5
@@ -416,4 +434,29 @@ extern const struct pow_log_data
   struct {double invc, pad, logc, logctail;} tab[1 << POW_LOG_TABLE_BITS];
 } __pow_log_data HIDDEN;
 
+extern const struct erff_data
+{
+  float erff_poly_A[6]; /* erff uses it for |x| < 0.875.  */
+  float erff_poly_B[7]; /* erff uses it for 0.875 <= |x| < 4.0.  */
+} __erff_data HIDDEN;
+
+#define ERF_POLY_A_ORDER 19 /* Presumably the degree in x of x + x*P(x^2) - TODO confirm.  */
+#define ERF_POLY_A_NCOEFFS 10
+#define ERFC_POLY_C_NCOEFFS 16
+#define ERFC_POLY_D_NCOEFFS 18
+#define ERFC_POLY_E_NCOEFFS 14
+#define ERFC_POLY_F_NCOEFFS 17
+extern const struct erf_data
+{
+  double erf_poly_A[ERF_POLY_A_NCOEFFS]; /* erf uses it for |x| < 0.5.  */
+  double erf_ratio_N_A[5]; /* Numerator, erf uses it for 0.5 <= |x| < 0.84375.  */
+  double erf_ratio_D_A[5]; /* Denominator for the same range.  */
+  double erf_ratio_N_B[7]; /* Numerator, erf uses it for 0.84375 <= |x| < 1.25.  */
+  double erf_ratio_D_B[6]; /* Denominator for the same range.  */
+  double erfc_poly_C[ERFC_POLY_C_NCOEFFS]; /* erf uses it for 1.25 <= |x| < 2.  */
+  double erfc_poly_D[ERFC_POLY_D_NCOEFFS]; /* erf uses it for 2 <= |x| < 3.25.  */
+  double erfc_poly_E[ERFC_POLY_E_NCOEFFS]; /* erf uses it for 3.25 <= |x| < 4.  */
+  double erfc_poly_F[ERFC_POLY_F_NCOEFFS]; /* erf uses it for 4 <= |x| < 5.90625.  */
+} __erf_data HIDDEN;
+
 #endif
diff --git a/math/math_errf.c b/math/math_errf.c
index 07154c5..d5350b8 100644
--- a/math/math_errf.c
+++ b/math/math_errf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision math error handling.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -64,3 +64,17 @@ __math_invalidf (float x)
   float y = (x - x) / (x - x);
   return isnan (x) ? y : with_errnof (y, EDOM);
 }
+
+/* Check result and set errno if necessary.  */
+
+HIDDEN float
+__math_check_uflowf (float y)
+{
+  return y == 0.0f ? with_errnof (y, ERANGE) : y; /* Any zero y, +0 or -0, takes the ERANGE path.  */
+}
+
+HIDDEN float
+__math_check_oflowf (float y)
+{
+  return isinf (y) ? with_errnof (y, ERANGE) : y; /* Infinite y takes the ERANGE path.  */
+}
diff --git a/math/pow.c b/math/pow.c
index ced7c4f..86842c6 100644
--- a/math/pow.c
+++ b/math/pow.c
@@ -1,7 +1,7 @@
 /*
  * Double-precision x^y function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/powf.c b/math/powf.c
index 1534a09..6ba45d3 100644
--- a/math/powf.c
+++ b/math/powf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision pow function.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/powf_log2_data.c b/math/powf_log2_data.c
index b9fbdc4..97e0d98 100644
--- a/math/powf_log2_data.c
+++ b/math/powf_log2_data.c
@@ -1,7 +1,7 @@
 /*
  * Data definition for powf.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/sincosf.c b/math/sincosf.c
index e6cd41e..9746f1c 100644
--- a/math/sincosf.c
+++ b/math/sincosf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision sin/cos function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/sincosf_data.c b/math/sincosf_data.c
index 5d0b58e..ab4ac47 100644
--- a/math/sincosf_data.c
+++ b/math/sincosf_data.c
@@ -1,7 +1,7 @@
 /*
  * Data definition for sinf, cosf and sincosf.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/sinf.c b/math/sinf.c
index 770b294..ddbc1da 100644
--- a/math/sinf.c
+++ b/math/sinf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision sin function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/mathbench.c b/math/test/mathbench.c
index 33ceda3..0c17826 100644
--- a/math/test/mathbench.c
+++ b/math/test/mathbench.c
@@ -1,7 +1,7 @@
 /*
  * Microbenchmark for math functions.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -248,6 +248,7 @@ D (log2, 0.999, 1.001)
 {"pow", 'd', 0, 0.01, 11.1, {.d = xypow}},
 D (xpow, 0.01, 11.1)
 D (ypow, -9.9, 9.9)
+D (erf, -6.0, 6.0)
 
 F (dummyf, 1.0, 2.0)
 F (expf, -9.9, 9.9)
@@ -275,6 +276,7 @@ F (cosf, -3.1, 3.1)
 F (cosf, 3.3, 33.3)
 F (cosf, 100, 1000)
 F (cosf, 1e6, 1e32)
+F (erff, -4.0, 4.0)
 #if WANT_VMATH
 D (__s_sin, -3.1, 3.1)
 D (__s_cos, -3.1, 3.1)
diff --git a/math/test/mathtest.c b/math/test/mathtest.c
index 2ff8c3f..3108967 100644
--- a/math/test/mathtest.c
+++ b/math/test/mathtest.c
@@ -1,7 +1,7 @@
 /*
  * mathtest.c - test rig for mathlib
  *
- * Copyright (c) 1998-2018, Arm Limited.
+ * Copyright (c) 1998-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/dotest.c b/math/test/rtest/dotest.c
index f416477..6be79e1 100644
--- a/math/test/rtest/dotest.c
+++ b/math/test/rtest/dotest.c
@@ -1,7 +1,7 @@
 /*
  * dotest.c - actually generate mathlib test cases
  *
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/intern.h b/math/test/rtest/intern.h
index af574b0..12a9c74 100644
--- a/math/test/rtest/intern.h
+++ b/math/test/rtest/intern.h
@@ -1,7 +1,7 @@
 /*
  * intern.h
  *
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/main.c b/math/test/rtest/main.c
index e94e455..0d8ead8 100644
--- a/math/test/rtest/main.c
+++ b/math/test/rtest/main.c
@@ -1,7 +1,7 @@
 /*
  * main.c
  *
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/random.c b/math/test/rtest/random.c
index e97a8c6..5612396 100644
--- a/math/test/rtest/random.c
+++ b/math/test/rtest/random.c
@@ -1,7 +1,7 @@
 /*
  * random.c - random number generator for producing mathlib test cases
  *
- * Copyright (c) 1998-2018, Arm Limited.
+ * Copyright (c) 1998-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/random.h b/math/test/rtest/random.h
index c1ce956..b4b22df 100644
--- a/math/test/rtest/random.h
+++ b/math/test/rtest/random.h
@@ -1,7 +1,7 @@
 /*
  * random.h - header for random.c
  *
- * Copyright (c) 2009-2018, Arm Limited.
+ * Copyright (c) 2009-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/semi.c b/math/test/rtest/semi.c
index 938dc3a..c9f0daf 100644
--- a/math/test/rtest/semi.c
+++ b/math/test/rtest/semi.c
@@ -1,7 +1,7 @@
 /*
  * semi.c: test implementations of mathlib seminumerical functions
  *
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/semi.h b/math/test/rtest/semi.h
index da473a2..17dc415 100644
--- a/math/test/rtest/semi.h
+++ b/math/test/rtest/semi.h
@@ -1,7 +1,7 @@
 /*
  * semi.h: header for semi.c
  *
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/types.h b/math/test/rtest/types.h
index 1a76c2e..53cd557 100644
--- a/math/test/rtest/types.h
+++ b/math/test/rtest/types.h
@@ -1,7 +1,7 @@
 /*
  * types.h
  *
- * Copyright (c) 2005-2018, Arm Limited.
+ * Copyright (c) 2005-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/wrappers.c b/math/test/rtest/wrappers.c
index acaf671..de45ac5 100644
--- a/math/test/rtest/wrappers.c
+++ b/math/test/rtest/wrappers.c
@@ -1,7 +1,7 @@
 /*
  * wrappers.c - wrappers to modify output of MPFR/MPC test functions
  *
- * Copyright (c) 2014-2018, Arm Limited.
+ * Copyright (c) 2014-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/wrappers.h b/math/test/rtest/wrappers.h
index 5804935..7b09c85 100644
--- a/math/test/rtest/wrappers.h
+++ b/math/test/rtest/wrappers.h
@@ -1,7 +1,7 @@
 /*
  * wrappers.h - wrappers to modify output of MPFR/MPC test functions
  *
- * Copyright (c) 2014-2018, Arm Limited.
+ * Copyright (c) 2014-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/runulp.sh b/math/test/runulp.sh
old mode 100644
new mode 100755
index a8c391b..0190d9a
--- a/math/test/runulp.sh
+++ b/math/test/runulp.sh
@@ -2,7 +2,7 @@
 
 # ULP error check script.
 #
-# Copyright (c) 2019, Arm Limited.
+# Copyright (c) 2019-2020, Arm Limited.
 # SPDX-License-Identifier: MIT
 
 #set -x
@@ -72,6 +72,16 @@ t pow  0x1.ffffffffffff0p-1  0x1.0000000000008p0 x 0x1p60 0x1p68 50000
 t pow  0x1.ffffffffff000p-1  0x1p0 x 0x1p50 0x1p52 50000
 t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000
 
+L=1.0
+Ldir=0.9
+t erf  0 0xffff000000000000 10000
+t erf  0x1p-1022  0x1p-26   40000
+t erf  -0x1p-1022 -0x1p-26  40000
+t erf  0x1p-26    0x1p3     40000
+t erf  -0x1p-26  -0x1p3     40000
+t erf  0         inf        40000
+Ldir=0.5
+
 L=0.01
 t expf  0    0xffff0000    10000
 t expf  0x1p-14   0x1p8    50000
@@ -119,6 +129,17 @@ t powf  0x1p-70 0x1p70  x  0x1p-1 0x1p1   50000
 t powf  0x1p-70 0x1p70  x  -0x1p-1 -0x1p1 50000
 t powf  0x1.ep-1 0x1.1p0 x  0x1p8 0x1p14  50000
 t powf  0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000
+
+L=0.6
+Ldir=0.9
+t erff  0      0xffff0000 10000
+t erff  0x1p-127  0x1p-26 40000
+t erff -0x1p-127 -0x1p-26 40000
+t erff  0x1p-26   0x1p3   40000
+t erff -0x1p-26  -0x1p3   40000
+t erff  0         inf     40000
+Ldir=0.5
+
 done
 
 # vector functions
diff --git a/math/test/testcases/directed/cosf.tst b/math/test/testcases/directed/cosf.tst
index 5dc0994..7916044 100644
--- a/math/test/testcases/directed/cosf.tst
+++ b/math/test/testcases/directed/cosf.tst
@@ -1,6 +1,6 @@
 ; cosf.tst - Directed test cases for SP cosine
 ;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=cosf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/erf.tst b/math/test/testcases/directed/erf.tst
new file mode 100644
index 0000000..7fa4d18
--- /dev/null
+++ b/math/test/testcases/directed/erf.tst
@@ -0,0 +1,17 @@
+; erf.tst - Directed test cases for erf
+;
+; Copyright (c) 2007-2020, Arm Limited.
+; SPDX-License-Identifier: MIT
+
+func=erf op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=erf op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=erf op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=erf op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=erf op1=7ff00000.00000000 result=3ff00000.00000000 errno=0
+func=erf op1=fff00000.00000000 result=bff00000.00000000 errno=0
+func=erf op1=00000000.00000000 result=00000000.00000000 errno=ERANGE
+func=erf op1=80000000.00000000 result=80000000.00000000 errno=ERANGE
+func=erf op1=00000000.00000001 result=00000000.00000001 errno=0 status=ux
+func=erf op1=80000000.00000001 result=80000000.00000001 errno=0 status=ux
+func=erf op1=3ff00000.00000000 result=3feaf767.a741088a.c6d errno=0
+func=erf op1=bff00000.00000000 result=bfeaf767.a741088a.c6d errno=0
diff --git a/math/test/testcases/directed/erff.tst b/math/test/testcases/directed/erff.tst
new file mode 100644
index 0000000..d05b7b1
--- /dev/null
+++ b/math/test/testcases/directed/erff.tst
@@ -0,0 +1,17 @@
+; erff.tst - Directed test cases for erff
+;
+; Copyright (c) 2007-2020, Arm Limited.
+; SPDX-License-Identifier: MIT
+
+func=erff op1=7fc00001 result=7fc00001 errno=0
+func=erff op1=ffc00001 result=7fc00001 errno=0
+func=erff op1=7f800001 result=7fc00001 errno=0 status=i
+func=erff op1=ff800001 result=7fc00001 errno=0 status=i
+func=erff op1=7f800000 result=3f800000 errno=0
+func=erff op1=ff800000 result=bf800000 errno=0
+func=erff op1=00000000 result=00000000 errno=ERANGE
+func=erff op1=80000000 result=80000000 errno=ERANGE
+func=erff op1=00000001 result=00000001 errno=0 status=ux
+func=erff op1=80000001 result=80000001 errno=0 status=ux
+func=erff op1=3f800000 result=3f57bb3d.3a0 errno=0
+func=erff op1=bf800000 result=bf57bb3d.3a0 errno=0
diff --git a/math/test/testcases/directed/exp.tst b/math/test/testcases/directed/exp.tst
index addfc0a..85d556c 100644
--- a/math/test/testcases/directed/exp.tst
+++ b/math/test/testcases/directed/exp.tst
@@ -1,6 +1,6 @@
 ; Directed test cases for exp
 ;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=exp op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/exp2.tst b/math/test/testcases/directed/exp2.tst
index 04a5a50..fa56c9f 100644
--- a/math/test/testcases/directed/exp2.tst
+++ b/math/test/testcases/directed/exp2.tst
@@ -1,6 +1,6 @@
 ; Directed test cases for exp2
 ;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=exp2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/exp2f.tst b/math/test/testcases/directed/exp2f.tst
index 2b6a9b5..38cfc3f 100644
--- a/math/test/testcases/directed/exp2f.tst
+++ b/math/test/testcases/directed/exp2f.tst
@@ -1,6 +1,6 @@
 ; exp2f.tst - Directed test cases for exp2f
 ;
-; Copyright (c) 2017-2018, Arm Limited.
+; Copyright (c) 2017-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=exp2f op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/expf.tst b/math/test/testcases/directed/expf.tst
index 74664c7..ff0f671 100644
--- a/math/test/testcases/directed/expf.tst
+++ b/math/test/testcases/directed/expf.tst
@@ -1,6 +1,6 @@
 ; expf.tst - Directed test cases for expf
 ;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=expf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/log.tst b/math/test/testcases/directed/log.tst
index eeb762c..a0aa398 100644
--- a/math/test/testcases/directed/log.tst
+++ b/math/test/testcases/directed/log.tst
@@ -1,6 +1,6 @@
 ; Directed test cases for log
 ;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=log op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/log2.tst b/math/test/testcases/directed/log2.tst
index e0765d8..ff1286c 100644
--- a/math/test/testcases/directed/log2.tst
+++ b/math/test/testcases/directed/log2.tst
@@ -1,6 +1,6 @@
 ; Directed test cases for log2
 ;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/log2f.tst b/math/test/testcases/directed/log2f.tst
index 8d685ba..5832c4f 100644
--- a/math/test/testcases/directed/log2f.tst
+++ b/math/test/testcases/directed/log2f.tst
@@ -1,6 +1,6 @@
 ; log2f.tst - Directed test cases for log2f
 ;
-; Copyright (c) 2017-2018, Arm Limited.
+; Copyright (c) 2017-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=log2f op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/logf.tst b/math/test/testcases/directed/logf.tst
index 7ccc873..6e68a36 100644
--- a/math/test/testcases/directed/logf.tst
+++ b/math/test/testcases/directed/logf.tst
@@ -1,6 +1,6 @@
 ; logf.tst - Directed test cases for logf
 ;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=logf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/pow.tst b/math/test/testcases/directed/pow.tst
index a4c42be..1966581 100644
--- a/math/test/testcases/directed/pow.tst
+++ b/math/test/testcases/directed/pow.tst
@@ -1,6 +1,6 @@
 ; Directed test cases for pow
 ;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=pow op1=00000000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0
diff --git a/math/test/testcases/directed/powf.tst b/math/test/testcases/directed/powf.tst
index efd1dd5..3fa8b11 100644
--- a/math/test/testcases/directed/powf.tst
+++ b/math/test/testcases/directed/powf.tst
@@ -1,6 +1,6 @@
 ; powf.tst - Directed test cases for powf
 ;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=powf op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i
diff --git a/math/test/testcases/directed/sincosf.tst b/math/test/testcases/directed/sincosf.tst
index b4b2526..4b33d22 100644
--- a/math/test/testcases/directed/sincosf.tst
+++ b/math/test/testcases/directed/sincosf.tst
@@ -1,6 +1,6 @@
 ; Directed test cases for SP sincos
 ;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 
diff --git a/math/test/testcases/directed/sinf.tst b/math/test/testcases/directed/sinf.tst
index 13cfdca..ded80b1 100644
--- a/math/test/testcases/directed/sinf.tst
+++ b/math/test/testcases/directed/sinf.tst
@@ -1,6 +1,6 @@
 ; sinf.tst - Directed test cases for SP sine
 ;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 
diff --git a/math/test/testcases/random/double.tst b/math/test/testcases/random/double.tst
index c37e837..c24ff80 100644
--- a/math/test/testcases/random/double.tst
+++ b/math/test/testcases/random/double.tst
@@ -1,6 +1,6 @@
 !! double.tst - Random test case specification for DP functions
 !!
-!! Copyright (c) 1999-2018, Arm Limited.
+!! Copyright (c) 1999-2019, Arm Limited.
 !! SPDX-License-Identifier: MIT
 
 test exp 10000
diff --git a/math/test/testcases/random/float.tst b/math/test/testcases/random/float.tst
index baf62b9..d02a227 100644
--- a/math/test/testcases/random/float.tst
+++ b/math/test/testcases/random/float.tst
@@ -1,6 +1,6 @@
 !! single.tst - Random test case specification for SP functions
 !!
-!! Copyright (c) 1999-2018, Arm Limited.
+!! Copyright (c) 1999-2019, Arm Limited.
 !! SPDX-License-Identifier: MIT
 
 test sinf 10000
diff --git a/math/test/ulp.c b/math/test/ulp.c
index 371567a..51479b8 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -1,7 +1,7 @@
 /*
  * ULP error checking tool for math functions.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -331,11 +331,13 @@ static const struct fun fun[] = {
  F1 (log)
  F1 (log2)
  F2 (pow)
+ F1 (erf)
  D1 (exp)
  D1 (exp2)
  D1 (log)
  D1 (log2)
  D2 (pow)
+ D1 (erf)
 #if WANT_VMATH
  F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0)
  F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0)
diff --git a/math/tools/plot.py b/math/tools/plot.py
old mode 100644
new mode 100755
diff --git a/math/tools/remez.jl b/math/tools/remez.jl
old mode 100644
new mode 100755
index f479fc5..2ff436f
--- a/math/tools/remez.jl
+++ b/math/tools/remez.jl
@@ -3,7 +3,7 @@
 
 # remez.jl - implementation of the Remez algorithm for polynomial approximation
 #
-# Copyright (c) 2015-2018, Arm Limited.
+# Copyright (c) 2015-2019, Arm Limited.
 # SPDX-License-Identifier: MIT
 
 import Base.\
diff --git a/math/v_math.h b/math/v_math.h
index 3db22e5..f2cc467 100644
--- a/math/v_math.h
+++ b/math/v_math.h
@@ -1,7 +1,7 @@
 /*
  * Vector math abstractions.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/networking/test/chksum.c b/networking/test/chksum.c
index 50722a4..41b9812 100644
--- a/networking/test/chksum.c
+++ b/networking/test/chksum.c
@@ -1,7 +1,7 @@
 /*
  * Ones' complement checksum test & benchmark
  *
- * Copyright 2016-2020 ARM Limited
+ * Copyright (c) 2016-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/string/Dir.mk b/string/Dir.mk
index ae7c673..cf3453f 100644
--- a/string/Dir.mk
+++ b/string/Dir.mk
@@ -1,6 +1,6 @@
 # Makefile fragment - requires GNU make
 #
-# Copyright (c) 2019-2020, Arm Limited.
+# Copyright (c) 2019-2021, Arm Limited.
 # SPDX-License-Identifier: MIT
 
 S := $(srcdir)/string
@@ -29,6 +29,8 @@ string-tests := \
 	build/bin/test/memchr \
 	build/bin/test/memrchr \
 	build/bin/test/memcmp \
+	build/bin/test/__mtag_tag_region \
+	build/bin/test/__mtag_tag_zero_region \
 	build/bin/test/strcpy \
 	build/bin/test/stpcpy \
 	build/bin/test/strcmp \
@@ -39,7 +41,9 @@ string-tests := \
 	build/bin/test/strnlen \
 	build/bin/test/strncmp
 
-string-benches := build/bin/bench/memcpy
+string-benches := \
+	build/bin/bench/memcpy \
+	build/bin/bench/strlen
 
 string-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-lib-srcs)))
 string-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-test-srcs)))
@@ -95,6 +99,7 @@ check-string: $(string-tests-out)
 	! grep FAIL $^
 
 bench-string: $(string-benches)
+	$(EMULATOR) build/bin/bench/strlen
 	$(EMULATOR) build/bin/bench/memcpy
 
 install-string: \
diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S
new file mode 100644
index 0000000..84339f7
--- /dev/null
+++ b/string/aarch64/__mtag_tag_region.S
@@ -0,0 +1,100 @@
+/*
+ * __mtag_tag_region - tag memory
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, MTE, LP64 ABI.
+ *
+ * Interface contract:
+ * Address is 16 byte aligned and size is multiple of 16.
+ * Returns the passed pointer.
+ * The memory region may remain untagged if tagging is not enabled.
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_MEMORY_TAGGING
+
+#define dstin	x0	/* In: region start; also the return value.  */
+#define count	x1	/* In: region size in bytes.  */
+#define dst	x2	/* Moving store address in the long-region loops.  */
+#define dstend	x3	/* dstin + count, one past the end of the region.  */
+#define tmp	x4	/* Scratch for the 0..48 byte case.  */
+#define zva_val	x4	/* Scratch for the DCZID_EL0 check; shares x4 with tmp.  */
+
+ENTRY (__mtag_tag_region)
+	PTR_ARG (0)
+	SIZE_ARG (1)
+
+	add	dstend, dstin, count
+
+	cmp	count, 96
+	b.hi	L(set_long)
+
+	tbnz	count, 6, L(set96)	/* Bit 6 set: count is 64, 80 or 96.  */
+
+	/* Set 0, 16, 32, or 48 bytes.  */
+	lsr	tmp, count, 5		/* 0 for count < 32, else 1.  */
+	add	tmp, dstin, tmp, lsl 4	/* dstin, or dstin + 16.  */
+	cbz     count, L(end)
+	stg	dstin, [dstin]		/* Tag value is taken from dstin.  */
+	stg	dstin, [tmp]
+	stg	dstin, [dstend, -16]	/* May overlap the stores above.  */
+L(end):
+	ret
+
+	.p2align 4
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+L(set96):
+	st2g	dstin, [dstin]
+	st2g	dstin, [dstin, 32]
+	st2g	dstin, [dstend, -32]	/* Overlaps the above when count < 96.  */
+	ret
+
+	.p2align 4
+	/* Size is > 96 bytes.  */
+L(set_long):
+	cmp	count, 160
+	b.lo	L(no_zva)		/* Under 160 bytes: plain store loop.  */
+
+#ifndef SKIP_ZVA_CHECK
+	mrs	zva_val, dczid_el0
+	and	zva_val, zva_val, 31	/* Keep DZP (bit 4) and BS (bits 3:0).  */
+	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
+	b.ne	L(no_zva)
+#endif
+	st2g	dstin, [dstin]
+	st2g	dstin, [dstin, 32]	/* First 64 bytes are tagged by stores.  */
+	bic	dst, dstin, 63		/* Align down to 64; upper bits kept.  */
+	sub	count, dstend, dst	/* Count is now 64 too large.  */
+	sub	count, count, 128	/* Adjust count and bias for loop.  */
+
+	.p2align 4
+L(zva_loop):
+	add	dst, dst, 64
+	dc	gva, dst		/* Tag one whole 64-byte block at dst.  */
+	subs	count, count, 64
+	b.hi	L(zva_loop)
+	st2g	dstin, [dstend, -64]	/* Last 64 bytes are tagged by stores.  */
+	st2g	dstin, [dstend, -32]
+	ret
+
+L(no_zva):
+	sub	dst, dstin, 32		/* Dst is biased by -32.  */
+	sub	count, count, 64	/* Adjust count for loop.  */
+L(no_zva_loop):
+	st2g	dstin, [dst, 32]
+	st2g	dstin, [dst, 64]!	/* Pre-index: dst += 64 before the store.  */
+	subs	count, count, 64
+	b.hi	L(no_zva_loop)
+	st2g	dstin, [dstend, -64]	/* Last 64 bytes; may overlap the loop.  */
+	st2g	dstin, [dstend, -32]
+	ret
+
+END (__mtag_tag_region)
+#endif /* __ARM_FEATURE_MEMORY_TAGGING */
diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S
new file mode 100644
index 0000000..f58364c
--- /dev/null
+++ b/string/aarch64/__mtag_tag_zero_region.S
@@ -0,0 +1,100 @@
+/*
+ * __mtag_tag_zero_region - tag memory and fill it with zero bytes
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, MTE, LP64 ABI.
+ *
+ * Interface contract:
+ * Address is 16 byte aligned and size is multiple of 16.
+ * Returns the passed pointer.
+ * The memory region may remain untagged if tagging is not enabled.
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_MEMORY_TAGGING
+
+#define dstin	x0	/* In: region start; also the return value.  */
+#define count	x1	/* In: region size in bytes.  */
+#define dst	x2	/* Moving store address in the long-region loops.  */
+#define dstend	x3	/* dstin + count, one past the end of the region.  */
+#define tmp	x4	/* Scratch for the 0..48 byte case.  */
+#define zva_val	x4	/* Scratch for the DCZID_EL0 check; shares x4 with tmp.  */
+
+ENTRY (__mtag_tag_zero_region)
+	PTR_ARG (0)
+	SIZE_ARG (1)
+
+	add	dstend, dstin, count
+
+	cmp	count, 96
+	b.hi	L(set_long)
+
+	tbnz	count, 6, L(set96)	/* Bit 6 set: count is 64, 80 or 96.  */
+
+	/* Set 0, 16, 32, or 48 bytes.  */
+	lsr	tmp, count, 5		/* 0 for count < 32, else 1.  */
+	add	tmp, dstin, tmp, lsl 4	/* dstin, or dstin + 16.  */
+	cbz     count, L(end)
+	stzg	dstin, [dstin]		/* Tag (taken from dstin) and zero.  */
+	stzg	dstin, [tmp]
+	stzg	dstin, [dstend, -16]	/* May overlap the stores above.  */
+L(end):
+	ret
+
+	.p2align 4
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+L(set96):
+	stz2g	dstin, [dstin]
+	stz2g	dstin, [dstin, 32]
+	stz2g	dstin, [dstend, -32]	/* Overlaps the above when count < 96.  */
+	ret
+
+	.p2align 4
+	/* Size is > 96 bytes.  */
+L(set_long):
+	cmp	count, 160
+	b.lo	L(no_zva)		/* Under 160 bytes: plain store loop.  */
+
+#ifndef SKIP_ZVA_CHECK
+	mrs	zva_val, dczid_el0
+	and	zva_val, zva_val, 31	/* Keep DZP (bit 4) and BS (bits 3:0).  */
+	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
+	b.ne	L(no_zva)
+#endif
+	stz2g	dstin, [dstin]
+	stz2g	dstin, [dstin, 32]	/* First 64 bytes are done by stores.  */
+	bic	dst, dstin, 63		/* Align down to 64; upper bits kept.  */
+	sub	count, dstend, dst	/* Count is now 64 too large.  */
+	sub	count, count, 128	/* Adjust count and bias for loop.  */
+
+	.p2align 4
+L(zva_loop):
+	add	dst, dst, 64
+	dc	gzva, dst		/* Tag and zero one 64-byte block at dst.  */
+	subs	count, count, 64
+	b.hi	L(zva_loop)
+	stz2g	dstin, [dstend, -64]	/* Last 64 bytes are done by stores.  */
+	stz2g	dstin, [dstend, -32]
+	ret
+
+L(no_zva):
+	sub	dst, dstin, 32		/* Dst is biased by -32.  */
+	sub	count, count, 64	/* Adjust count for loop.  */
+L(no_zva_loop):
+	stz2g	dstin, [dst, 32]
+	stz2g	dstin, [dst, 64]!	/* Pre-index: dst += 64 before the store.  */
+	subs	count, count, 64
+	b.hi	L(no_zva_loop)
+	stz2g	dstin, [dstend, -64]	/* Last 64 bytes; may overlap the loop.  */
+	stz2g	dstin, [dstend, -32]
+	ret
+
+END (__mtag_tag_zero_region)
+#endif /* __ARM_FEATURE_MEMORY_TAGGING */
diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S
index 31ad050..c2e967d 100644
--- a/string/aarch64/memchr-mte.S
+++ b/string/aarch64/memchr-mte.S
@@ -44,6 +44,8 @@
    string, counting trailing zeros identifies exactly which byte matched.  */
 
 ENTRY (__memchr_aarch64_mte)
+	PTR_ARG (0)
+	SIZE_ARG (2)
 	bic	src, srcin, 15
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S
index 4a5c726..c22e659 100644
--- a/string/aarch64/memchr-sve.S
+++ b/string/aarch64/memchr-sve.S
@@ -1,7 +1,7 @@
 /*
  * memchr - find a character in a memory zone
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -14,15 +14,14 @@
  * SVE Available.
  */
 
-	.arch	armv8-a+sve
-	.text
-
-ENTRY_ALIGN(__memchr_aarch64_sve, 4)
+ENTRY (__memchr_aarch64_sve)
+	PTR_ARG (0)
+	SIZE_ARG (2)
 	dup	z1.b, w1			/* duplicate c to a vector */
 	setffr					/* initialize FFR */
 	mov	x3, 0				/* initialize off */
-	nop
 
+	.p2align 4
 0:	whilelo	p1.b, x3, x2			/* make sure off < max */
 	b.none	9f
 
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
index dfba79f..353f0d1 100644
--- a/string/aarch64/memchr.S
+++ b/string/aarch64/memchr.S
@@ -1,7 +1,7 @@
 /*
  * memchr - find a character in a memory zone
  *
- * Copyright (c) 2014-2019, Arm Limited.
+ * Copyright (c) 2014-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -47,6 +47,8 @@
  */
 
 ENTRY (__memchr_aarch64)
+	PTR_ARG (0)
+	SIZE_ARG (2)
 	/* Do not dereference srcin if no bytes to compare.  */
 	cbz	cntin, L(zero_length)
 	/*
diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S
index 8a0a2ea..78c5eca 100644
--- a/string/aarch64/memcmp-sve.S
+++ b/string/aarch64/memcmp-sve.S
@@ -1,7 +1,7 @@
 /*
  * memcmp - compare memory
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -14,10 +14,10 @@
  * SVE Available.
  */
 
-	.arch	armv8-a+sve
-	.text
-
-ENTRY_ALIGN (__memcmp_aarch64_sve, 4)
+ENTRY (__memcmp_aarch64_sve)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
 	mov	x3, 0			/* initialize off */
 
 0:	whilelo	p0.b, x3, x2		/* while off < max */
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
index dac9147..3b10266 100644
--- a/string/aarch64/memcmp.S
+++ b/string/aarch64/memcmp.S
@@ -1,6 +1,6 @@
 /* memcmp - compare memory
  *
- * Copyright (c) 2013, Arm Limited.
+ * Copyright (c) 2013-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -28,6 +28,9 @@
 #define tmp2		x8
 
 ENTRY (__memcmp_aarch64)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
 	subs	limit, limit, 8
 	b.lo	L(less8)
 
diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S
index 3004179..f97f2c3 100644
--- a/string/aarch64/memcpy-advsimd.S
+++ b/string/aarch64/memcpy-advsimd.S
@@ -52,6 +52,9 @@
 
 ENTRY_ALIAS (__memmove_aarch64_simd)
 ENTRY (__memcpy_aarch64_simd)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
 	add	srcend, src, count
 	add	dstend, dstin, count
 	cmp	count, 128
@@ -179,12 +182,13 @@ L(copy_long_backwards):
 	b.ls	L(copy64_from_start)
 
 L(loop64_backwards):
-	stp	A_q, B_q, [dstend, -32]
+	str	B_q, [dstend, -16]
+	str	A_q, [dstend, -32]
 	ldp	A_q, B_q, [srcend, -96]
-	stp	C_q, D_q, [dstend, -64]
+	str	D_q, [dstend, -48]
+	str	C_q, [dstend, -64]!
 	ldp	C_q, D_q, [srcend, -128]
 	sub	srcend, srcend, 64
-	sub	dstend, dstend, 64
 	subs	count, count, 64
 	b.hi	L(loop64_backwards)
 
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
index 157bb0d..dd254f6 100644
--- a/string/aarch64/memcpy.S
+++ b/string/aarch64/memcpy.S
@@ -55,6 +55,9 @@
 
 ENTRY_ALIAS (__memmove_aarch64)
 ENTRY (__memcpy_aarch64)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
 	add	srcend, src, count
 	add	dstend, dstin, count
 	cmp	count, 128
diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S
index ad42b49..7b4be84 100644
--- a/string/aarch64/memrchr.S
+++ b/string/aarch64/memrchr.S
@@ -46,6 +46,7 @@
    string, counting trailing zeros identifies exactly which byte matched.  */
 
 ENTRY (__memrchr_aarch64)
+	PTR_ARG (0)
 	add	end, srcin, cntin
 	sub	endm1, end, 1
 	bic	src, endm1, 15
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
index 27743f1..9fcd975 100644
--- a/string/aarch64/memset.S
+++ b/string/aarch64/memset.S
@@ -1,7 +1,7 @@
 /*
  * memset - fill memory with a constant byte
  *
- * Copyright (c) 2012-2020, Arm Limited.
+ * Copyright (c) 2012-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -22,6 +22,8 @@
 #define zva_val	x5
 
 ENTRY (__memset_aarch64)
+	PTR_ARG (0)
+	SIZE_ARG (2)
 
 	dup	v0.16B, valw
 	add	dstend, dstin, count
@@ -37,7 +39,7 @@ ENTRY (__memset_aarch64)
 	str	val, [dstin]
 	str	val, [dstend, -8]
 	ret
-	nop
+	.p2align 4
 1:	tbz	count, 2, 2f
 	str	valw, [dstin]
 	str	valw, [dstend, -4]
diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S
index 577752e..dcb0e46 100644
--- a/string/aarch64/strchr-mte.S
+++ b/string/aarch64/strchr-mte.S
@@ -43,6 +43,7 @@
    string, counting trailing zeros identifies exactly which byte matched.  */
 
 ENTRY (__strchr_aarch64_mte)
+	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S
index 495beda..13ba9f4 100644
--- a/string/aarch64/strchr-sve.S
+++ b/string/aarch64/strchr-sve.S
@@ -1,7 +1,7 @@
 /*
  * strchr/strchrnul - find a character in a string
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -14,9 +14,6 @@
  * SVE Available.
  */
 
-	.arch	armv8-a+sve
-	.text
-
 /* To build as strchrnul, define BUILD_STRCHRNUL before compiling this file.  */
 #ifdef BUILD_STRCHRNUL
 #define FUNC  __strchrnul_aarch64_sve
@@ -24,7 +21,8 @@
 #define FUNC  __strchr_aarch64_sve
 #endif
 
-ENTRY_ALIGN (FUNC, 4)
+ENTRY (FUNC)
+	PTR_ARG (0)
 	dup	z1.b, w1		/* replicate byte across vector */
 	setffr				/* initialize FFR */
 	ptrue	p1.b			/* all ones; loop invariant */
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
index 8d8e3fc..1063cbf 100644
--- a/string/aarch64/strchr.S
+++ b/string/aarch64/strchr.S
@@ -1,7 +1,7 @@
 /*
  * strchr - find a character in a string
  *
- * Copyright (c) 2014-2019, Arm Limited.
+ * Copyright (c) 2014-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -51,6 +51,7 @@
 /* Locals and temporaries.  */
 
 ENTRY (__strchr_aarch64)
+	PTR_ARG (0)
 	/* Magic constant 0xc0300c03 to allow us to identify which lane
 	   matches the requested byte.  Even bits are set if the character
 	   matches, odd bits if either the char is NUL or matches.  */
diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S
index 0dbf0dc..1b0d0a6 100644
--- a/string/aarch64/strchrnul-mte.S
+++ b/string/aarch64/strchrnul-mte.S
@@ -41,6 +41,7 @@
    string, counting trailing zeros identifies exactly which byte matched.  */
 
 ENTRY (__strchrnul_aarch64_mte)
+	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S
index 5140e59..428ff1a 100644
--- a/string/aarch64/strchrnul-sve.S
+++ b/string/aarch64/strchrnul-sve.S
@@ -1,7 +1,7 @@
 /*
  * strchrnul - find a character or nul in a string
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
index 45be15c..a4230d9 100644
--- a/string/aarch64/strchrnul.S
+++ b/string/aarch64/strchrnul.S
@@ -1,7 +1,7 @@
 /*
  * strchrnul - find a character or nul in a string
  *
- * Copyright (c) 2014-2019, Arm Limited.
+ * Copyright (c) 2014-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -47,6 +47,7 @@
 /* Locals and temporaries.  */
 
 ENTRY (__strchrnul_aarch64)
+	PTR_ARG (0)
 	/* Magic constant 0x40100401 to allow us to identify which lane
 	   matches the termination condition.  */
 	mov	wtmp2, #0x0401
diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S
index 8f2abc4..12d1a6b 100644
--- a/string/aarch64/strcmp-mte.S
+++ b/string/aarch64/strcmp-mte.S
@@ -51,6 +51,8 @@
 
 
 ENTRY (__strcmp_aarch64_mte)
+	PTR_ARG (0)
+	PTR_ARG (1)
 	sub	off2, src2, src1
 	mov	zeroones, REP8_01
 	and	tmp, src1, 7
@@ -99,6 +101,8 @@ L(end):
 	sub	result, data1, data2, lsr 56
 	ret
 
+	.p2align 4
+
 L(mutual_align):
 	/* Sources are mutually aligned, but are not currently at an
 	   alignment boundary.  Round down the addresses and then mask off
@@ -127,17 +131,18 @@ L(do_misaligned):
 	b.ne	L(do_misaligned)
 
 L(src1_aligned):
-	lsl	shift, src2, 3
+	neg	shift, src2, lsl 3
 	bic	src2, src2, 7
 	ldr	data3, [src2], 8
 #ifdef __AARCH64EB__
 	rev	data3, data3
 #endif
+	lsr	tmp, zeroones, shift
+	orr	data3, data3, tmp
 	sub	has_nul, data3, zeroones
 	orr	tmp, data3, REP8_7f
-	bic	has_nul, has_nul, tmp
-	lsr	tmp, has_nul, shift
-	cbnz	tmp, L(tail)
+	bics	has_nul, has_nul, tmp
+	b.ne	L(tail)
 
 	sub	off1, src2, src1
 
@@ -156,8 +161,7 @@ L(loop_unaligned):
 	ccmp	data1, data2, 0, eq
 	b.eq	L(loop_unaligned)
 
-	neg	tmp, shift
-	lsl	tmp, has_nul, tmp
+	lsl	tmp, has_nul, shift
 #ifdef __AARCH64EB__
 	rev	tmp, tmp
 #endif
@@ -166,6 +170,7 @@ L(loop_unaligned):
 	cbnz	syndrome, L(end)
 L(tail):
 	ldr	data1, [src1]
+	neg	shift, shift
 	lsr	data2, data3, shift
 	lsr	has_nul, has_nul, shift
 #ifdef __AARCH64EB__
@@ -180,6 +185,5 @@ L(done):
 	sub	result, data1, data2
 	ret
 
-
 END (__strcmp_aarch64_mte)
 
diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S
index dc5b769..e6d2da5 100644
--- a/string/aarch64/strcmp-sve.S
+++ b/string/aarch64/strcmp-sve.S
@@ -1,7 +1,7 @@
 /*
  * __strcmp_aarch64_sve - compare two strings
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -14,16 +14,15 @@
  * SVE Available.
  */
 
-	.arch	armv8-a+sve
-	.text
-
-ENTRY_ALIGN (__strcmp_aarch64_sve, 4)
+ENTRY (__strcmp_aarch64_sve)
+	PTR_ARG (0)
+	PTR_ARG (1)
 	setffr				/* initialize FFR */
 	ptrue	p1.b, all		/* all ones; loop invariant */
 	mov	x2, 0			/* initialize offset */
-	nop
 
 	/* Read a vector's worth of bytes, stopping on first fault.  */
+	.p2align 4
 0:	ldff1b	z0.b, p1/z, [x0, x2]
 	ldff1b	z1.b, p1/z, [x1, x2]
 	rdffrs	p0.b, p1/z
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
index ee95958..7714ebf 100644
--- a/string/aarch64/strcmp.S
+++ b/string/aarch64/strcmp.S
@@ -37,6 +37,8 @@
 
 	/* Start of performance-critical section  -- one 64B cache line.  */
 ENTRY (__strcmp_aarch64)
+	PTR_ARG (0)
+	PTR_ARG (1)
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
 	tst	tmp1, #7
diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S
index 7c8629e..88c222d 100644
--- a/string/aarch64/strcpy-mte.S
+++ b/string/aarch64/strcpy-mte.S
@@ -55,6 +55,8 @@
    string, counting trailing zeros identifies exactly which byte matched.  */
 
 ENTRY (STRCPY)
+	PTR_ARG (0)
+	PTR_ARG (1)
 	bic	src, srcin, 15
 	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S
index a785d45..f515462 100644
--- a/string/aarch64/strcpy-sve.S
+++ b/string/aarch64/strcpy-sve.S
@@ -1,7 +1,7 @@
 /*
  * strcpy/stpcpy - copy a string returning pointer to start/end.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -14,9 +14,6 @@
  * SVE Available.
  */
 
-	.arch	armv8-a+sve
-	.text
-
 /* To build as stpcpy, define BUILD_STPCPY before compiling this file.  */
 #ifdef BUILD_STPCPY
 #define FUNC  __stpcpy_aarch64_sve
@@ -24,7 +21,9 @@
 #define FUNC  __strcpy_aarch64_sve
 #endif
 
-ENTRY_ALIGN (FUNC, 4)
+ENTRY (FUNC)
+	PTR_ARG (0)
+	PTR_ARG (1)
 	setffr				/* initialize FFR */
 	ptrue	p2.b, all		/* all ones; loop invariant */
 	mov	x2, 0			/* initialize offset */
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index a6090c8..6e9ed42 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -1,7 +1,7 @@
 /*
  * strcpy/stpcpy - copy a string returning pointer to start/end.
  *
- * Copyright (c) 2013-2019, Arm Limited.
+ * Copyright (c) 2013-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -80,6 +80,8 @@
 #define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
 
 ENTRY (STRCPY)
+	PTR_ARG (0)
+	PTR_ARG (1)
 	/* For moderately short strings, the fastest way to do the copy is to
 	   calculate the length of the string in the same way as strlen, then
 	   essentially do a memcpy of the result.  This avoids the need for
diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S
index 6a99340..7cf41d5 100644
--- a/string/aarch64/strlen-mte.S
+++ b/string/aarch64/strlen-mte.S
@@ -39,6 +39,7 @@
    string, counting trailing zeros identifies exactly which byte matched.  */
 
 ENTRY (__strlen_aarch64_mte)
+	PTR_ARG (0)
 	bic	src, srcin, 15
 	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S
index 9a9a359..2392493 100644
--- a/string/aarch64/strlen-sve.S
+++ b/string/aarch64/strlen-sve.S
@@ -1,7 +1,7 @@
 /*
  * __strlen_aarch64_sve - compute the length of a string
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -14,18 +14,15 @@
  * SVE Available.
  */
 
-	.arch	armv8-a+sve
-	.text
-
-ENTRY_ALIGN (__strlen_aarch64_sve, 4)
+ENTRY (__strlen_aarch64_sve)
+	PTR_ARG (0)
 	setffr			/* initialize FFR */
 	ptrue	p2.b		/* all ones; loop invariant */
 	mov	x1, 0		/* initialize length */
-	nop
 
 	/* Read a vector's worth of bytes, stopping on first fault.  */
+	.p2align 4
 0:	ldff1b	z0.b, p2/z, [x0, x1]
-	nop
 	rdffrs	p0.b, p2/z
 	b.nlast	2f
 
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
index 3aa444b..a1b164a 100644
--- a/string/aarch64/strlen.S
+++ b/string/aarch64/strlen.S
@@ -1,84 +1,88 @@
 /*
- * strlen - calculate the length of a string
+ * strlen - calculate the length of a string.
  *
- * Copyright (c) 2013, Arm Limited.
+ * Copyright (c) 2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * Not MTE compatible.
  */
 
 #include "../asmdefs.h"
 
-/* To test the page crossing code path more thoroughly, compile with
-   -DTEST_PAGE_CROSS - this will force all calls through the slower
-   entry path.  This option is not intended for production use.	 */
-
-/* Arguments and results.  */
-#define srcin		x0
-#define len		x0
-
-/* Locals and temporaries.  */
-#define src		x1
-#define data1		x2
-#define data2		x3
-#define has_nul1	x4
-#define has_nul2	x5
-#define tmp1		x4
-#define tmp2		x5
-#define tmp3		x6
-#define tmp4		x7
-#define zeroones	x8
-
-	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	   can be done in parallel across the entire word. A faster check
-	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
-	   false hits for characters 129..255.	*/
+#define srcin	x0
+#define len	x0
+
+#define src	x1
+#define data1	x2
+#define data2	x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1	x4
+#define tmp2	x5
+#define tmp3	x6
+#define tmp4	x7
+#define zeroones x8
+
+#define maskv	v0
+#define maskd	d0
+#define dataq1	q1
+#define dataq2	q2
+#define datav1	v1
+#define datav2	v2
+#define tmp	x2
+#define tmpw	w2
+#define synd	x3
+#define shift	x4
+
+/* For the first 32 bytes, NUL detection works on the principle that
+   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
+   byte is zero, and can be done in parallel across the entire word.  */
 
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
+
+/* To test the page crossing code path more thoroughly, compile with
+   -DTEST_PAGE_CROSS - this will force all calls through the slower
+   entry path.  This option is not intended for production use.  */
 
 #ifdef TEST_PAGE_CROSS
-# define MIN_PAGE_SIZE 15
+# define MIN_PAGE_SIZE 32
 #else
 # define MIN_PAGE_SIZE 4096
 #endif
 
-	/* Since strings are short on average, we check the first 16 bytes
-	   of the string for a NUL character.  In order to do an unaligned ldp
-	   safely we have to do a page cross check first.  If there is a NUL
-	   byte we calculate the length from the 2 8-byte words using
-	   conditional select to reduce branch mispredictions (it is unlikely
-	   __strlen_aarch64 will be repeatedly called on strings with the same length).
-
-	   If the string is longer than 16 bytes, we align src so don't need
-	   further page cross checks, and process 32 bytes per iteration
-	   using the fast NUL check.  If we encounter non-ASCII characters,
-	   fallback to a second loop using the full NUL check.
-
-	   If the page cross check fails, we read 16 bytes from an aligned
-	   address, remove any characters before the string, and continue
-	   in the main loop using aligned loads.  Since strings crossing a
-	   page in the first 16 bytes are rare (probability of
-	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
-
-	   AArch64 systems have a minimum page size of 4k.  We don't bother
-	   checking for larger page sizes - the cost of setting up the correct
-	   page size is just not worth the extra gain from a small reduction in
-	   the cases taking the slow path.  Note that we only care about
-	   whether the first fetch, which may be misaligned, crosses a page
-	   boundary.  */
+/* Core algorithm:
+
+   Since strings are short on average, we check the first 32 bytes of the
+   string for a NUL character without aligning the string.  In order to use
+   unaligned loads safely we must do a page cross check first.
+
+   If there is a NUL byte we calculate the length from the 2 8-byte words
+   using conditional select to reduce branch mispredictions (it is unlikely
+   strlen will be repeatedly called on strings with the same length).
+
+   If the string is longer than 32 bytes, align src so we don't need further
+   page cross checks, and process 32 bytes per iteration using a fast SIMD
+   loop.
+
+   If the page cross check fails, we read 32 bytes from an aligned address
+   and ignore any characters before the string.  If the rest contains a NUL
+   character, return the length; otherwise continue in the main loop.  */
 
 ENTRY (__strlen_aarch64)
+	PTR_ARG (0)
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
-	mov	zeroones, REP8_01
-	cmp	tmp1, MIN_PAGE_SIZE - 16
-	b.gt	L(page_cross)
+	cmp	tmp1, MIN_PAGE_SIZE - 32
+	b.hi	L(page_cross)
+
+	/* Look for a NUL byte in the first 16 bytes.  */
 	ldp	data1, data2, [srcin]
+	mov	zeroones, REP8_01
+
 #ifdef __AARCH64EB__
 	/* For big-endian, carry propagation (if the final byte in the
 	   string is 0x01) means we cannot use has_nul1/2 directly.
@@ -94,114 +98,103 @@ ENTRY (__strlen_aarch64)
 	bics	has_nul1, tmp1, tmp2
 	bic	has_nul2, tmp3, tmp4
 	ccmp	has_nul2, 0, 0, eq
-	beq	L(main_loop_entry)
+	b.eq	L(bytes16_31)
 
-	/* Enter with C = has_nul1 == 0.  */
+	/* Find the exact offset of the first NUL byte in the first 16 bytes
+	   from the string start.  Enter with C = has_nul1 == 0.  */
 	csel	has_nul1, has_nul1, has_nul2, cc
 	mov	len, 8
 	rev	has_nul1, has_nul1
-	clz	tmp1, has_nul1
 	csel	len, xzr, len, cc
+	clz	tmp1, has_nul1
 	add	len, len, tmp1, lsr 3
 	ret
 
-	/* The inner loop processes 32 bytes per iteration and uses the fast
-	   NUL check.  If we encounter non-ASCII characters, use a second
-	   loop with the accurate NUL check.  */
-	.p2align 4
-L(main_loop_entry):
-	bic	src, srcin, 15
-	sub	src, src, 16
-L(main_loop):
-	ldp	data1, data2, [src, 32]!
-L(page_cross_entry):
-	sub	tmp1, data1, zeroones
-	sub	tmp3, data2, zeroones
-	orr	tmp2, tmp1, tmp3
-	tst	tmp2, zeroones, lsl 7
-	bne	1f
-	ldp	data1, data2, [src, 16]
+	.p2align 3
+	/* Look for a NUL byte at offset 16..31 in the string.  */
+L(bytes16_31):
+	ldp	data1, data2, [srcin, 16]
+#ifdef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
 	sub	tmp1, data1, zeroones
-	sub	tmp3, data2, zeroones
-	orr	tmp2, tmp1, tmp3
-	tst	tmp2, zeroones, lsl 7
-	beq	L(main_loop)
-	add	src, src, 16
-1:
-	/* The fast check failed, so do the slower, accurate NUL check.	 */
 	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
 	orr	tmp4, data2, REP8_7f
 	bics	has_nul1, tmp1, tmp2
 	bic	has_nul2, tmp3, tmp4
 	ccmp	has_nul2, 0, 0, eq
-	beq	L(nonascii_loop)
+	b.eq	L(loop_entry)
 
-	/* Enter with C = has_nul1 == 0.  */
-L(tail):
-#ifdef __AARCH64EB__
-	/* For big-endian, carry propagation (if the final byte in the
-	   string is 0x01) means we cannot use has_nul1/2 directly.  The
-	   easiest way to get the correct byte is to byte-swap the data
-	   and calculate the syndrome a second time.  */
-	csel	data1, data1, data2, cc
-	rev	data1, data1
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, REP8_7f
-	bic	has_nul1, tmp1, tmp2
-#else
+	/* Find the exact offset of the first NUL byte at offset 16..31 from
+	   the string start.  Enter with C = has_nul1 == 0.  */
 	csel	has_nul1, has_nul1, has_nul2, cc
-#endif
-	sub	len, src, srcin
+	mov	len, 24
 	rev	has_nul1, has_nul1
-	add	tmp2, len, 8
+	mov	tmp3, 16
 	clz	tmp1, has_nul1
-	csel	len, len, tmp2, cc
+	csel	len, tmp3, len, cc
 	add	len, len, tmp1, lsr 3
 	ret
 
-L(nonascii_loop):
-	ldp	data1, data2, [src, 16]!
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, REP8_7f
-	bics	has_nul1, tmp1, tmp2
-	bic	has_nul2, tmp3, tmp4
-	ccmp	has_nul2, 0, 0, eq
-	bne	L(tail)
-	ldp	data1, data2, [src, 16]!
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, REP8_7f
-	bics	has_nul1, tmp1, tmp2
-	bic	has_nul2, tmp3, tmp4
-	ccmp	has_nul2, 0, 0, eq
-	beq	L(nonascii_loop)
-	b	L(tail)
+L(loop_entry):
+	bic	src, srcin, 31
 
-	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
-	   srcin to 0x7f, so we ignore any NUL bytes before the string.
-	   Then continue in the aligned loop.  */
-L(page_cross):
-	bic	src, srcin, 15
-	ldp	data1, data2, [src]
-	lsl	tmp1, srcin, 3
-	mov	tmp4, -1
+	.p2align 5
+L(loop):
+	ldp	dataq1, dataq2, [src, 32]!
+	uminp	maskv.16b, datav1.16b, datav2.16b
+	uminp	maskv.16b, maskv.16b, maskv.16b
+	cmeq	maskv.8b, maskv.8b, 0
+	fmov	synd, maskd
+	cbz	synd, L(loop)
+
+	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
+	cmeq	maskv.16b, datav1.16b, 0
+	sub	len, src, srcin
+	tst	synd, 0xffffffff
+	b.ne	1f
+	cmeq	maskv.16b, datav2.16b, 0
+	add	len, len, 16
+1:
+	/* Generate a bitmask and compute correct byte offset.  */
 #ifdef __AARCH64EB__
-	/* Big-endian.	Early bytes are at MSB.	 */
-	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
+	bic	maskv.8h, 0xf0
 #else
-	/* Little-endian.  Early bytes are at LSB.  */
-	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
+	bic	maskv.8h, 0x0f, lsl 8
 #endif
-	orr	tmp1, tmp1, REP8_80
-	orn	data1, data1, tmp1
-	orn	tmp2, data2, tmp1
-	tst	srcin, 8
-	csel	data1, data1, tmp4, eq
-	csel	data2, data2, tmp2, eq
-	b	L(page_cross_entry)
+	umaxp	maskv.16b, maskv.16b, maskv.16b
+	fmov	synd, maskd
+#ifndef __AARCH64EB__
+	rbit	synd, synd
+#endif
+	clz	tmp, synd
+	add	len, len, tmp, lsr 2
+	ret
 
-END (__strlen_aarch64)
+        .p2align 4
 
+L(page_cross):
+	bic	src, srcin, 31		/* Round down to a 32-byte boundary.  */
+	mov	tmpw, 0x0c03
+	movk	tmpw, 0xc030, lsl 16	/* tmpw = 0xc0300c03.  */
+	ld1	{datav1.16b, datav2.16b}, [src]	/* Aligned 32-byte load.  */
+	dup	maskv.4s, tmpw		/* Byte lanes: 03, 0c, 30, c0, ...  */
+	cmeq	datav1.16b, datav1.16b, 0	/* 0xff in every NUL byte.  */
+	cmeq	datav2.16b, datav2.16b, 0
+	and	datav1.16b, datav1.16b, maskv.16b	/* Keep 2 bits per byte.  */
+	and	datav2.16b, datav2.16b, maskv.16b
+	addp	maskv.16b, datav1.16b, datav2.16b	/* 32 bytes -> 16.  */
+	addp	maskv.16b, maskv.16b, maskv.16b	/* Low 8 bytes: 2 bits/byte.  */
+	fmov	synd, maskd
+	lsl	shift, srcin, 1		/* 2 syndrome bits per byte.  */
+	lsr	synd, synd, shift	/* Shift (shift & 63): drop bytes before srcin.  */
+	cbz	synd, L(loop)		/* No NUL: continue in the main loop.  */
+
+	rbit	synd, synd
+	clz	len, synd		/* Trailing zero bits of synd.  */
+	lsr	len, len, 1		/* 2 bits per byte -> length.  */
+	ret
+
+END (__strlen_aarch64)
diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S
index b7e3914..c9d6fc8 100644
--- a/string/aarch64/strncmp-mte.S
+++ b/string/aarch64/strncmp-mte.S
@@ -1,7 +1,7 @@
 /*
  * strncmp - compare two strings
  *
- * Copyright (c) 2013-2020, Arm Limited.
+ * Copyright (c) 2013-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -53,12 +53,10 @@
 #define LS_BK lsl
 #endif
 
-	.text
-	.p2align 6
-	.rep 9
-	nop	/* Pad so that the loop below fits a cache line.  */
-	.endr
-ENTRY_ALIGN (__strncmp_aarch64_mte, 0)
+ENTRY (__strncmp_aarch64_mte)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
 	cbz	limit, L(ret0)
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
@@ -70,7 +68,7 @@ ENTRY_ALIGN (__strncmp_aarch64_mte, 0)
 	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
 	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
 	   can be done in parallel across the entire word.  */
-	/* Start of performance-critical section  -- one 64B cache line.  */
+	.p2align 4
 L(loop_aligned):
 	ldr	data1, [src1], #8
 	ldr	data2, [src2], #8
@@ -83,7 +81,7 @@ L(start_realigned):
 	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
 	ccmp	endloop, #0, #0, eq
 	b.eq	L(loop_aligned)
-	/* End of performance-critical section  -- one 64B cache line.  */
+	/* End of main loop.  */
 
 L(full_check):
 #ifndef __AARCH64EB__
@@ -167,15 +165,15 @@ L(mutual_align):
 	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
 	ldr	data2, [src2], #8
 	mov	tmp2, #~0
-	and	count, count, #0x3f
 	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
-	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
-	add	limit, limit, count
+	/* Adjust the limit and ensure it doesn't overflow.  */
+	adds	limit, limit, count
+	csinv	limit, limit, xzr, lo
 	orr	data1, data1, tmp2
 	orr	data2, data2, tmp2
 	b	L(start_realigned)
 
-	.p2align 6
+	.p2align 4
 	/* Don't bother with dwords for up to 16 bytes.  */
 L(misaligned8):
 	cmp	limit, #16
diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S
index fdbe7ae..234190e 100644
--- a/string/aarch64/strncmp-sve.S
+++ b/string/aarch64/strncmp-sve.S
@@ -1,7 +1,7 @@
 /*
  * strncmp - compare two strings with limit
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -14,10 +14,10 @@
  * SVE Available.
  */
 
-	.arch	armv8-a+sve
-	.text
-
-ENTRY_ALIGN (__strncmp_aarch64_sve, 4)
+ENTRY (__strncmp_aarch64_sve)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
 	setffr				/* initialize FFR */
 	mov	x3, 0			/* initialize off */
 
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
index 584c54a..738b653 100644
--- a/string/aarch64/strncmp.S
+++ b/string/aarch64/strncmp.S
@@ -1,7 +1,7 @@
 /*
  * strncmp - compare two strings
  *
- * Copyright (c) 2013, Arm Limited.
+ * Copyright (c) 2013-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -40,12 +40,10 @@
 #define endloop		x15
 #define count		mask
 
-	.text
-	.p2align 6
-	.rep 6
-	nop	/* Pad so that the loop below fits a cache line.  */
-	.endr
-ENTRY_ALIGN (__strncmp_aarch64, 0)
+ENTRY (__strncmp_aarch64)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
 	cbz	limit, L(ret0)
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
@@ -60,7 +58,7 @@ ENTRY_ALIGN (__strncmp_aarch64, 0)
 	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
 	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
 	   can be done in parallel across the entire word.  */
-	/* Start of performance-critical section  -- one 64B cache line.  */
+	.p2align 4
 L(loop_aligned):
 	ldr	data1, [src1], #8
 	ldr	data2, [src2], #8
@@ -73,7 +71,7 @@ L(start_realigned):
 	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
 	ccmp	endloop, #0, #0, eq
 	b.eq	L(loop_aligned)
-	/* End of performance-critical section  -- one 64B cache line.  */
+	/* End of main loop.  */
 
 	/* Not reached the limit, must have found the end or a diff.  */
 	tbz	limit_wd, #63, L(not_limit)
@@ -178,7 +176,7 @@ L(mutual_align):
 	add	limit_wd, limit_wd, tmp3, lsr #3
 	b	L(start_realigned)
 
-	.p2align 6
+	.p2align 4
 	/* Don't bother with dwords for up to 16 bytes.  */
 L(misaligned8):
 	cmp	limit, #16
diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S
index 5ad40d3..5b9ebf7 100644
--- a/string/aarch64/strnlen-sve.S
+++ b/string/aarch64/strnlen-sve.S
@@ -1,7 +1,7 @@
 /*
  * strnlen - calculate the length of a string with limit.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -14,10 +14,9 @@
  * SVE Available.
  */
 
-	.arch	armv8-a+sve
-	.text
-
-ENTRY_ALIGN (__strnlen_aarch64_sve, 4)
+ENTRY (__strnlen_aarch64_sve)
+	PTR_ARG (0)
+	SIZE_ARG (1)
 	setffr				/* initialize FFR */
 	mov	x2, 0			/* initialize len */
 	b	1f
@@ -66,7 +65,7 @@ ENTRY_ALIGN (__strnlen_aarch64_sve, 4)
 	b	1b
 
 	/* End of count.  Return max.  */
-9:	mov	x0, x2
+9:	mov	x0, x1
 	ret
 
 END (__strnlen_aarch64_sve)
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
index 4852edc..48d2495 100644
--- a/string/aarch64/strnlen.S
+++ b/string/aarch64/strnlen.S
@@ -42,6 +42,8 @@
    string, counting trailing zeros identifies exactly which byte matched.  */
 
 ENTRY (__strnlen_aarch64)
+	PTR_ARG (0)
+	SIZE_ARG (1)
 	bic	src, srcin, 15
 	mov	wtmp, 0xf00f
 	cbz	cntin, L(nomatch)
diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S
index 5a409b9..1e4fb1a 100644
--- a/string/aarch64/strrchr-mte.S
+++ b/string/aarch64/strrchr-mte.S
@@ -44,6 +44,7 @@
    if the relevant byte matched the NUL end of string.  */
 
 ENTRY (__strrchr_aarch64_mte)
+	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	mov	wtmp, 0x3003
diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S
index dbb9bfd..d36d69a 100644
--- a/string/aarch64/strrchr-sve.S
+++ b/string/aarch64/strrchr-sve.S
@@ -1,7 +1,7 @@
 /*
  * strrchr - find the last of a character in a string
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -14,10 +14,8 @@
  * SVE Available.
  */
 
-	.arch	armv8-a+sve
-	.text
-
-ENTRY_ALIGN (__strrchr_aarch64_sve, 4)
+ENTRY (__strrchr_aarch64_sve)
+	PTR_ARG (0)
 	dup	z1.b, w1		/* replicate byte across vector */
 	setffr				/* initialize FFR */
 	ptrue	p1.b			/* all ones; loop invariant */
diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S
index f3d22d4..56185ff 100644
--- a/string/aarch64/strrchr.S
+++ b/string/aarch64/strrchr.S
@@ -55,6 +55,7 @@
    identify exactly which byte is causing the termination, and why.  */
 
 ENTRY (__strrchr_aarch64)
+	PTR_ARG (0)
 	/* Magic constant 0x40100401 to allow us to identify which lane
 	   matches the requested byte.  Magic constant 0x80200802 used
 	   similarly for NUL termination.  */
diff --git a/string/arm/memchr.S b/string/arm/memchr.S
index 565708c..3f1ac4d 100644
--- a/string/arm/memchr.S
+++ b/string/arm/memchr.S
@@ -1,7 +1,7 @@
 /*
  * memchr - scan memory for a character
  *
- * Copyright (c) 2010, Arm Limited.
+ * Copyright (c) 2010-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -26,13 +26,11 @@
 	.arch armv7-a
 
 @ this lets us check a flag in a 00/ff byte easily in either endianness
-#define __memchr_arm memchr
 #ifdef __ARMEB__
 #define CHARTSTMASK(c) 1<<(31-(c*8))
 #else
 #define CHARTSTMASK(c) 1<<(c*8)
 #endif
-	.text
 	.thumb
 
 @ ---------------------------------------------------------------------------
diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S
index 46492b5..86e6493 100644
--- a/string/arm/memcpy.S
+++ b/string/arm/memcpy.S
@@ -1,7 +1,7 @@
 /*
  * memcpy - copy memory area
  *
- * Copyright (c) 2013, Arm Limited.
+ * Copyright (c) 2013-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -16,8 +16,8 @@
     Unaligned accesses
 
  */
+
 #include "../asmdefs.h"
-#define __memcpy_arm memcpy
 
 	.syntax unified
 	/* This implementation requires ARM state.  */
@@ -124,7 +124,7 @@ ENTRY (__memcpy_arm)
 
 	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
 	cmp	count, #64
-	bge	L(cpy_not_short)
+	bhs	L(cpy_not_short)
 	/* Deal with small copies quickly by dropping straight into the
 	   exit block.  */
 
@@ -239,10 +239,10 @@ L(cpy_not_short):
 
 1:
 	subs	tmp2, count, #64	/* Use tmp2 for count.  */
-	blt	L(tail63aligned)
+	blo	L(tail63aligned)
 
 	cmp	tmp2, #512
-	bge	L(cpy_body_long)
+	bhs	L(cpy_body_long)
 
 L(cpy_body_medium):			/* Count in tmp2.  */
 #ifdef USE_VFP
@@ -266,7 +266,7 @@ L(cpy_body_medium):			/* Count in tmp2.  */
 	add	src, src, #64
 	vstr	d1, [dst, #56]
 	add	dst, dst, #64
-	bge	1b
+	bhs	1b
 	tst	tmp2, #0x3f
 	beq	L(done)
 
@@ -312,7 +312,7 @@ L(tail63aligned):			/* Count in tmp2.  */
 	ldrd	A_l, A_h, [src, #64]!
 	strd	A_l, A_h, [dst, #64]!
 	subs	tmp2, tmp2, #64
-	bge	1b
+	bhs	1b
 	tst	tmp2, #0x3f
 	bne	1f
 	ldr	tmp2,[sp], #FRAME_SIZE
@@ -383,7 +383,7 @@ L(cpy_body_long):			/* Count in tmp2.  */
 	add	src, src, #32
 
 	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
-	blt	2f
+	blo	2f
 1:
 	cpy_line_vfp	d3, 0
 	cpy_line_vfp	d4, 64
@@ -395,7 +395,7 @@ L(cpy_body_long):			/* Count in tmp2.  */
 	add	dst, dst, #2 * 64
 	add	src, src, #2 * 64
 	subs	tmp2, tmp2, #prefetch_lines * 64
-	bge	1b
+	bhs	1b
 
 2:
 	cpy_tail_vfp	d3, 0
@@ -499,15 +499,15 @@ L(cpy_notaligned):
 1:
 	pld	[src, #(3 * 64)]
 	subs	count, count, #64
-	ldrmi	tmp2, [sp], #FRAME_SIZE
-	bmi	L(tail63unaligned)
+	ldrlo	tmp2, [sp], #FRAME_SIZE
+	blo	L(tail63unaligned)
 	pld	[src, #(4 * 64)]
 
 #ifdef USE_NEON
 	vld1.8	{d0-d3}, [src]!
 	vld1.8	{d4-d7}, [src]!
 	subs	count, count, #64
-	bmi	2f
+	blo	2f
 1:
 	pld	[src, #(4 * 64)]
 	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
@@ -515,7 +515,7 @@ L(cpy_notaligned):
 	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
 	vld1.8	{d4-d7}, [src]!
 	subs	count, count, #64
-	bpl	1b
+	bhs	1b
 2:
 	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
 	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
diff --git a/string/arm/memset.S b/string/arm/memset.S
index 3ee5238..11e9273 100644
--- a/string/arm/memset.S
+++ b/string/arm/memset.S
@@ -1,7 +1,7 @@
 /*
  * memset - fill memory with a constant
  *
- * Copyright (c) 2010, Arm Limited.
+ * Copyright (c) 2010-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -25,7 +25,6 @@
 #else
 #define CHARTSTMASK(c) 1<<(c*8)
 #endif
-	.text
 	.thumb
 
 @ ---------------------------------------------------------------------------
diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S
index 3e54519..b75d414 100644
--- a/string/arm/strcmp-armv6m.S
+++ b/string/arm/strcmp-armv6m.S
@@ -1,7 +1,7 @@
 /*
  * strcmp for ARMv6-M (optimized for performance, not size)
  *
- * Copyright (c) 2014-2019, Arm Limited.
+ * Copyright (c) 2014-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S
index 586c14d..51443e3 100644
--- a/string/arm/strcmp.S
+++ b/string/arm/strcmp.S
@@ -1,7 +1,7 @@
 /*
  * strcmp for ARMv7
  *
- * Copyright (c) 2012-2019, Arm Limited.
+ * Copyright (c) 2012-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -11,7 +11,7 @@
    available.  Use ldrd to support wider loads, provided the data
    is sufficiently aligned.  Use saturating arithmetic to optimize
    the compares.  */
-#define __strcmp_arm strcmp
+
 #include "../asmdefs.h"
 
 /* Build Options:
@@ -125,7 +125,6 @@
 #endif
 	.endm
 
-	.text
 	.p2align	5
 L(strcmp_start_addr):
 #if STRCMP_NO_PRECHECK == 0
diff --git a/string/arm/strcpy.c b/string/arm/strcpy.c
index 2554810..02cf94f 100644
--- a/string/arm/strcpy.c
+++ b/string/arm/strcpy.c
@@ -1,10 +1,11 @@
 /*
  * strcpy
  *
- * Copyright (c) 2008-2019, Arm Limited.
+ * Copyright (c) 2008-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
+#if defined (__thumb2__) && !defined (__thumb__)
 
 /* For GLIBC:
 #include <string.h>
@@ -12,7 +13,7 @@
 
 #undef strcmp
 */
-#define __strcpy_arm strcpy
+
 #ifdef __thumb2__
 #define magic1(REG) "#0x01010101"
 #define magic2(REG) "#0x80808080"
@@ -111,13 +112,8 @@ __strcpy_arm (char* dst, const char* src)
 # else
        "tst	r2, #0xff\n\t"
        "itet	ne\n\t"
-# ifdef __clang__
-       "strhne	r2, [ip], #2\n\t"
-       "strbeq	r2, [ip]\n\t"
-# else
        "strneh	r2, [ip], #2\n\t"
        "streqb	r2, [ip]\n\t"
-# endif
        "tstne	r2, #0xff00\n\t"
 # endif
        "bne	5b\n\t"
@@ -133,3 +129,5 @@ __strcpy_arm (char* dst, const char* src)
        "BX LR");
 }
 /* For GLIBC: libc_hidden_builtin_def (strcpy) */
+
+#endif /* defined (__thumb2__) && !defined (__thumb__)  */
diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S
index 046148a..5ad30c9 100644
--- a/string/arm/strlen-armv6t2.S
+++ b/string/arm/strlen-armv6t2.S
@@ -1,7 +1,7 @@
 /*
  * strlen - calculate the length of a string
  *
- * Copyright (c) 2010, Arm Limited.
+ * Copyright (c) 2010-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -12,7 +12,7 @@
    ARMv6T2, AArch32
 
  */
-#define __strlen_armv6t2 strlen
+
 #include "../asmdefs.h"
 
 #ifdef __ARMEB__
diff --git a/string/asmdefs.h b/string/asmdefs.h
index 31c0f9d..340b427 100644
--- a/string/asmdefs.h
+++ b/string/asmdefs.h
@@ -1,7 +1,7 @@
 /*
  * Macros for asm code.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -81,4 +81,18 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
 
 #define L(l) .L ## l
 
+#ifdef __ILP32__
+/* Sanitize padding bits of pointer argument N as per AAPCS64 (ILP32 only).  */
+#define PTR_ARG(n)  mov w##n, w##n
+#else /* Not ILP32: PTR_ARG expands to nothing.  */
+#define PTR_ARG(n)
+#endif
+
+#ifdef __ILP32__
+/* Sanitize padding bits of size argument N as per AAPCS64 (ILP32 only).  */
+#define SIZE_ARG(n)  mov w##n, w##n
+#else /* Not ILP32: SIZE_ARG expands to nothing.  */
+#define SIZE_ARG(n)
+#endif
+
 #endif
diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c
index 967507b..d5d4ea7 100644
--- a/string/bench/memcpy.c
+++ b/string/bench/memcpy.c
@@ -221,6 +221,40 @@ int main (void)
       printf ("\n");
     }
 
+  printf ("\nUnaligned forwards memmove:\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      printf ("%22s (B/ns) ", funtab[f].name);
+
+      for (int size = 1024; size <= 32768; size *= 2)
+	{
+	  uint64_t t = clock_get_ns ();
+	  for (int i = 0; i < ITERS3; i++)
+	    funtab[f].fun (a, a + 256 + (i & 31), size);
+	  t = clock_get_ns () - t;
+	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+		  size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+	}
+      printf ("\n");
+    }
+
+
+  printf ("\nUnaligned backwards memmove:\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      printf ("%22s (B/ns) ", funtab[f].name);
+
+      for (int size = 1024; size <= 32768; size *= 2)
+	{
+	  uint64_t t = clock_get_ns ();
+	  for (int i = 0; i < ITERS3; i++)
+	    funtab[f].fun (a + 256 + (i & 31), a, size);
+	  t = clock_get_ns () - t;
+	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+		  size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+	}
+      printf ("\n");
+    }
 
   return 0;
 }
diff --git a/string/bench/strlen.c b/string/bench/strlen.c
new file mode 100644
index 0000000..cc0f04b
--- /dev/null
+++ b/string/bench/strlen.c
@@ -0,0 +1,221 @@
+/*
+ * strlen benchmark.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "stringlib.h"
+#include "benchlib.h"
+
+#define ITERS 2000
+#define ITERS2 20000000
+#define ITERS3 2000000
+#define NUM_STRLEN 16384
+
+#define MAX_ALIGN 32
+#define MAX_STRLEN 256
+
+static char a[(MAX_STRLEN + 1) * MAX_ALIGN] __attribute__((__aligned__(4096)));
+
+#define F(x, mte) {#x, x, mte},
+/* strlen implementations to benchmark, selected by target architecture.  */
+static const struct fun
+{
+  const char *name;  /* Label printed in front of each result row.  */
+  size_t (*fun) (const char *s);  /* Implementation being timed.  */
+  int test_mte;  /* Not read in this file; presumably an MTE flag (TODO confirm).  */
+} funtab[] = {
+  // clang-format off
+  F(strlen, 0)
+#if __aarch64__
+  F(__strlen_aarch64, 0)
+  F(__strlen_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+  F(__strlen_aarch64_sve, 1)
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+  F(__strlen_armv6t2, 0)
+# endif
+#endif
+  {0, 0, 0}  /* Sentinel: the benchmark loops stop at name == 0.  */
+  // clang-format on
+};
+#undef F
+
+static uint16_t strlen_tests[NUM_STRLEN];
+
+typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
+typedef struct { uint8_t align; uint16_t freq; } align_data_t;
+
+#define SIZE_NUM 65536
+#define SIZE_MASK (SIZE_NUM - 1)
+static uint8_t strlen_len_arr[SIZE_NUM];
+
+/* Frequency data for strlen sizes up to 128 based on SPEC2017.  */
+static freq_data_t strlen_len_freq[] =
+{
+  { 12,22671}, { 18,12834}, { 13, 9555}, {  6, 6348}, { 17, 6095}, { 11, 2115},
+  { 10, 1335}, {  7,  814}, {  2,  646}, {  9,  483}, {  8,  471}, { 16,  418},
+  {  4,  390}, {  1,  388}, {  5,  233}, {  3,  204}, {  0,   79}, { 14,   79},
+  { 15,   69}, { 26,   36}, { 22,   35}, { 31,   24}, { 32,   24}, { 19,   21},
+  { 25,   17}, { 28,   15}, { 21,   14}, { 33,   14}, { 20,   13}, { 24,    9},
+  { 29,    9}, { 30,    9}, { 23,    7}, { 34,    7}, { 27,    6}, { 44,    5},
+  { 42,    4}, { 45,    3}, { 47,    3}, { 40,    2}, { 41,    2}, { 43,    2},
+  { 58,    2}, { 78,    2}, { 36,    2}, { 48,    1}, { 52,    1}, { 60,    1},
+  { 64,    1}, { 56,    1}, { 76,    1}, { 68,    1}, { 80,    1}, { 84,    1},
+  { 72,    1}, { 86,    1}, { 35,    1}, { 39,    1}, { 50,    1}, { 38,    1},
+  { 37,    1}, { 46,    1}, { 98,    1}, {102,    1}, {128,    1}, { 51,    1},
+  {107,    1}, { 0,     0}
+};
+
+#define ALIGN_NUM 1024
+#define ALIGN_MASK (ALIGN_NUM - 1)
+static uint8_t strlen_align_arr[ALIGN_NUM];
+
+/* Alignment data for strlen based on SPEC2017.  */
+static align_data_t string_align_freq[] =
+{
+  {8, 470}, {32, 427}, {16, 99}, {1, 19}, {2, 6}, {4, 3}, {0, 0}
+};
+
+static void
+init_strlen_distribution (void)
+{
+  int i, j, freq, size, n;
+  /* Store each length FREQ times; a zero freq ends the table.  */
+  for (n = i = 0; (freq = strlen_len_freq[i].freq) != 0; i++)
+    for (j = 0, size = strlen_len_freq[i].size; j < freq; j++)
+      strlen_len_arr[n++] = size;
+  assert (n == SIZE_NUM);  /* Frequencies must fill the array exactly.  */
+  /* Likewise store each alignment FREQ times in strlen_align_arr.  */
+  for (n = i = 0; (freq = string_align_freq[i].freq) != 0; i++)
+    for (j = 0, size = string_align_freq[i].align; j < freq; j++)
+      strlen_align_arr[n++] = size;
+  assert (n == ALIGN_NUM);  /* Frequencies must fill the array exactly.  */
+}
+
+static void
+init_strlen_tests (void)
+{
+  uint16_t index[MAX_ALIGN];
+  /* Fill a[] with 'x'; the NUL terminators are written by the loop below.  */
+  memset (a, 'x', sizeof (a));
+
+  /* Split a[] into MAX_ALIGN slots of MAX_STRLEN + 1 bytes, each NUL-ended.  */
+  for (int i = 0; i < MAX_ALIGN; i++)
+    {
+      index[i] = i * (MAX_STRLEN + 1);
+      a[index[i] + MAX_STRLEN] = 0;
+    }
+
+  /* Fill strlen_tests[] with offsets into a[], drawing a random alignment
+     and length from the two distribution arrays for each entry.  */
+  for (int n = 0; n < NUM_STRLEN; n++)
+    {
+      int align = strlen_align_arr[rand32 (0) & ALIGN_MASK];
+      int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK];
+      /* Slot K's NUL is at offset K mod 32, so this start is ALIGN mod 32.  */
+      strlen_tests[n] =
+	index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len;
+    }
+}
+
+static volatile size_t maskv = 0;
+
+int main (void)
+{
+  rand32 (0x12345678);
+  init_strlen_distribution ();
+  init_strlen_tests ();
+  /* Four benchmarks follow; each prints one row per funtab entry.  */
+  printf ("\nRandom strlen (bytes/ns):\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      size_t res = 0, strlen_size = 0, mask = maskv;
+      printf ("%22s ", funtab[f].name);
+      /* Untimed pass: sum the returned lengths, scaled to the timed run.  */
+      for (int c = 0; c < NUM_STRLEN; c++)
+	strlen_size += funtab[f].fun (a + strlen_tests[c]);
+      strlen_size *= ITERS;
+
+      /* Measure latency: (res & mask) is 0 but makes each call depend on res.  */
+      uint64_t t = clock_get_ns ();
+      for (int i = 0; i < ITERS; i++)
+	for (int c = 0; c < NUM_STRLEN; c++)
+	  res = funtab[f].fun (a + strlen_tests[c] + (res & mask));
+      t = clock_get_ns () - t;
+      printf ("%.2f\n", (double)strlen_size / t);
+    }
+
+  printf ("\nSmall aligned strlen (bytes/ns):\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      printf ("%22s ", funtab[f].name);
+
+      for (int size = 1; size <= 64; size *= 2)
+	{
+	  memset (a, 'x', size);
+	  a[size - 1] = 0;
+	  /* Throughput below credits SIZE bytes per call, NUL included.  */
+	  uint64_t t = clock_get_ns ();
+	  for (int i = 0; i < ITERS2; i++)
+	    funtab[f].fun (a);
+	  t = clock_get_ns () - t;
+	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+		  size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+	}
+      printf ("\n");
+    }
+
+  printf ("\nSmall unaligned strlen (bytes/ns):\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      printf ("%22s ", funtab[f].name);
+
+      int align = 9;  /* a[] is 4096-byte aligned, so a + 9 is not.  */
+      for (int size = 1; size <= 64; size *= 2)
+	{
+	  memset (a + align, 'x', size);
+	  a[align + size - 1] = 0;
+
+	  uint64_t t = clock_get_ns ();
+	  for (int i = 0; i < ITERS2; i++)
+	    funtab[f].fun (a + align);
+	  t = clock_get_ns () - t;
+	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+		  size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+	}
+      printf ("\n");
+    }
+
+  printf ("\nMedium strlen (bytes/ns):\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      printf ("%22s ", funtab[f].name);
+
+      for (int size = 128; size <= 4096; size *= 2)
+	{
+	  memset (a, 'x', size);
+	  a[size - 1] = 0;
+
+	  uint64_t t = clock_get_ns ();
+	  for (int i = 0; i < ITERS3; i++)
+	    funtab[f].fun (a);
+	  t = clock_get_ns () - t;
+	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+		  size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+	}
+      printf ("\n");
+    }
+
+  printf ("\n");
+
+  return 0;
+}
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index 67b0dbf..378c3cd 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -1,7 +1,7 @@
 /*
  * Public API.
  *
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -54,12 +54,11 @@ size_t __strlen_aarch64_sve (const char *);
 size_t __strnlen_aarch64_sve (const char *, size_t);
 int __strncmp_aarch64_sve (const char *, const char *, size_t);
 # endif
+# if __ARM_FEATURE_MEMORY_TAGGING
+void *__mtag_tag_region (void *, size_t);
+void *__mtag_tag_zero_region (void *, size_t);
+# endif
 #elif __arm__
-#define __memcpy_arm memcpy
-#define __memchr_arm memchr
-#define __strcpy_arm strcpy
-#define __strcmp_arm strcmp
-#define __strlen_armv6t2 strlen
 void *__memcpy_arm (void *__restrict, const void *__restrict, size_t);
 void *__memset_arm (void *, int, size_t);
 void *__memchr_arm (const void *, int, size_t);
diff --git a/string/test/__mtag_tag_region.c b/string/test/__mtag_tag_region.c
new file mode 100644
index 0000000..d8c02d9
--- /dev/null
+++ b/string/test/__mtag_tag_region.c
@@ -0,0 +1,147 @@
+/*
+ * __mtag_tag_region test.
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+/* Report a mismatch at offset AT in the LEN-byte tagged buffer P.  */
+static void
+mtag_quoteat (const char *prefix, void *p, int len, int at)
+{
+  /* Print tag, untag and quote the context, in that order.  */
+  printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at));
+  untag_buffer (p, len, 1); /* mte.h helper; presumably clears the tags.  */
+  p = untag_pointer (p); /* quoteat gets the pointer untag_pointer returns.  */
+  quoteat (prefix, p, len, at);
+}
+
+#define F(x) {#x, x}, /* Table entry: name string plus function pointer.  */
+/* Functions under test; the {0, 0} entry terminates the table.  */
+static const struct fun
+{
+  const char *name;
+  void *(*fun) (void *s, size_t n);
+} funtab[] = {
+// clang-format off
+#if __aarch64__
+  F(__mtag_tag_region)
+#endif
+  {0, 0}
+  // clang-format on
+};
+#undef F
+
+#define A 64 /* Alignment used by alignup; salign must stay below it.  */
+#define LEN 250000 /* Largest length test () accepts.  */
+static unsigned char *sbuf; /* Test buffer; main () sets it from mte_mmap.  */
+/* Round P up to the next multiple of A.  */
+static void *
+alignup (void *p)
+{
+  return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+/* Call FUN on LEN bytes at offset SALIGN of the aligned buffer and verify it.  */
+static void
+test (const struct fun *fun, int salign, int len)
+{
+  unsigned char *src = alignup (sbuf);
+  unsigned char *s = src + salign;
+  void *p;
+  int i;
+  /* Bail out after ERR_LIMIT errors; abort on out-of-range arguments.  */
+  if (err_count >= ERR_LIMIT)
+    return;
+  if (len > LEN || salign >= A)
+    abort ();
+  for (i = 0; i < len + 2 * A; i++)
+    src[i] = '?'; /* Guard pattern checked in the head and tail below.  */
+  for (i = 0; i < len; i++)
+    s[i] = 'a'; /* Payload that must survive the call unchanged.  */
+  /* Tag the buffer via tag_buffer (mte.h); rebase S on the returned pointer.  */
+  src = tag_buffer (src, len + 2 * A, 1);
+  s = src + salign;
+  /* Use different tag for the region handed to FUN.  */
+  s = __arm_mte_increment_tag (s, 1);
+  p = fun->fun (s, len);
+  /* FUN is expected to return its first argument.  */
+  if (p != s)
+    ERR ("%s(%p,..) returned %p\n", fun->name, s, p);
+  /* Head: the SALIGN bytes before S must still hold '?'.  */
+  for (i = 0; i < salign; i++)
+    {
+      if (src[i] != '?')
+	{
+	  ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+	  mtag_quoteat ("got head", src, len + 2 * A, i);
+	  return;
+	}
+    }
+  /* Body: the LEN bytes read through S must still hold 'a'.  */
+  for (; i < salign + len; i++)
+    {
+      if (s[i - salign] != 'a')
+	{
+	  ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+	  mtag_quoteat ("got body", src, len + 2 * A, i);
+	  return;
+	}
+    }
+  /* Tail: everything after the body must still hold '?'.  */
+  for (; i < len + 2 * A; i++)
+    {
+      if (src[i] != '?')
+	{
+	  ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+	  mtag_quoteat ("got tail", src, len + 2 * A, i);
+	  return;
+	}
+    }
+  /* Success-path cleanup; the failure paths untag inside mtag_quoteat.  */
+  untag_buffer (src, len + 2 * A, 1);
+}
+/* Test driver: returns 0 if MTE is off or every entry passes, else -1.  */
+int
+main ()
+{
+  if (!mte_enabled ())
+    return 0; /* Nothing to test; exit with success.  */
+  /* LEN + 3 * A covers alignup slack plus the 2 * A padding test () writes.  */
+  sbuf = mte_mmap (LEN + 3 * A); /* NOTE(review): result used unchecked.  */
+  int r = 0;
+  for (int i = 0; funtab[i].name; i++)
+    {
+      err_count = 0;
+      for (int s = 0; s < A; s += 16) /* Offsets 0, 16, 32, 48.  */
+	{
+	  int n;
+	  for (n = 0; n < 200; n += 16) /* Lengths 0, 16, ..., 192.  */
+	    {
+	      test (funtab + i, s, n);
+	    }
+	  for (; n < LEN; n *= 2) /* Then 208, 416, ... while below LEN.  */
+	    {
+	      test (funtab + i, s, n);
+	    }
+	}
+      printf ("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+      if (err_count)
+	r = -1; /* Remember the failure but keep testing other entries.  */
+    }
+  return r;
+}
+#else /* MTE test not enabled for this build.  */
+int
+main ()
+{
+  return 0; /* Nothing is exercised; always exit with status 0.  */
+}
+#endif /* __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST */
diff --git a/string/test/__mtag_tag_zero_region.c b/string/test/__mtag_tag_zero_region.c
new file mode 100644
index 0000000..221c223
--- /dev/null
+++ b/string/test/__mtag_tag_zero_region.c
@@ -0,0 +1,147 @@
+/*
+ * __mtag_tag_zero_region test.
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+/* Report a mismatch at offset AT in the LEN-byte tagged buffer P.  */
+static void
+mtag_quoteat (const char *prefix, void *p, int len, int at)
+{
+  /* Print tag, untag and quote the context, in that order.  */
+  printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at));
+  untag_buffer (p, len, 1); /* mte.h helper; presumably clears the tags.  */
+  p = untag_pointer (p); /* quoteat gets the pointer untag_pointer returns.  */
+  quoteat (prefix, p, len, at);
+}
+
+#define F(x) {#x, x}, /* Table entry: name string plus function pointer.  */
+/* Functions under test; the {0, 0} entry terminates the table.  */
+static const struct fun
+{
+  const char *name;
+  void *(*fun) (void *s, size_t n);
+} funtab[] = {
+// clang-format off
+#if __aarch64__
+  F(__mtag_tag_zero_region)
+#endif
+  {0, 0}
+  // clang-format on
+};
+#undef F
+
+#define A 64 /* Alignment used by alignup; salign must stay below it.  */
+#define LEN 250000 /* Largest length test () accepts.  */
+static unsigned char *sbuf; /* Test buffer; main () sets it from mte_mmap.  */
+/* Round P up to the next multiple of A.  */
+static void *
+alignup (void *p)
+{
+  return (void *) (((uintptr_t) p + A - 1) & -A);
+}
+/* Call FUN on LEN bytes at offset SALIGN of the aligned buffer and verify it.  */
+static void
+test (const struct fun *fun, int salign, int len)
+{
+  unsigned char *src = alignup (sbuf);
+  unsigned char *s = src + salign;
+  void *p;
+  int i;
+  /* Bail out after ERR_LIMIT errors; abort on out-of-range arguments.  */
+  if (err_count >= ERR_LIMIT)
+    return;
+  if (len > LEN || salign >= A)
+    abort ();
+  for (i = 0; i < len + 2 * A; i++)
+    src[i] = '?'; /* Guard pattern checked in the head and tail below.  */
+  for (i = 0; i < len; i++)
+    s[i] = 'a' + i % 23; /* Non-zero payload the call is expected to clear.  */
+  /* Tag the buffer via tag_buffer (mte.h); rebase S on the returned pointer.  */
+  src = tag_buffer (src, len + 2 * A, 1);
+  s = src + salign;
+  /* Use different tag for the region handed to FUN.  */
+  s = __arm_mte_increment_tag (s, 1);
+  p = fun->fun (s, len);
+  /* FUN is expected to return its first argument.  */
+  if (p != s)
+    ERR ("%s(%p,..) returned %p\n", fun->name, s, p);
+  /* Head: the SALIGN bytes before S must still hold '?'.  */
+  for (i = 0; i < salign; i++)
+    {
+      if (src[i] != '?')
+	{
+	  ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+	  mtag_quoteat ("got head", src, len + 2 * A, i);
+	  return;
+	}
+    }
+  /* Body: the LEN bytes read through S must now be zero.  */
+  for (; i < salign + len; i++)
+    {
+      if (s[i - salign] != 0)
+	{
+	  ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+	  mtag_quoteat ("got body", src, len + 2 * A, i);
+	  return;
+	}
+    }
+  /* Tail: everything after the body must still hold '?'.  */
+  for (; i < len + 2 * A; i++)
+    {
+      if (src[i] != '?')
+	{
+	  ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+	  mtag_quoteat ("got tail", src, len + 2 * A, i);
+	  return;
+	}
+    }
+  /* Success-path cleanup; the failure paths untag inside mtag_quoteat.  */
+  untag_buffer (src, len + 2 * A, 1);
+}
+/* Test driver: returns 0 if MTE is off or every entry passes, else -1.  */
+int
+main ()
+{
+  if (!mte_enabled ())
+    return 0; /* Nothing to test; exit with success.  */
+  /* LEN + 3 * A covers alignup slack plus the 2 * A padding test () writes.  */
+  sbuf = mte_mmap (LEN + 3 * A); /* NOTE(review): result used unchecked.  */
+  int r = 0;
+  for (int i = 0; funtab[i].name; i++)
+    {
+      err_count = 0;
+      for (int s = 0; s < A; s += 16) /* Offsets 0, 16, 32, 48.  */
+	{
+	  int n;
+	  for (n = 0; n < 200; n += 16) /* Lengths 0, 16, ..., 192.  */
+	    {
+	      test (funtab + i, s, n);
+	    }
+	  for (; n < LEN; n *= 2) /* Then 208, 416, ... while below LEN.  */
+	    {
+	      test (funtab + i, s, n);
+	    }
+	}
+      printf ("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
+      if (err_count)
+	r = -1; /* Remember the failure but keep testing other entries.  */
+    }
+  return r;
+}
+#else /* MTE test not enabled for this build.  */
+int
+main ()
+{
+  return 0; /* Nothing is exercised; always exit with status 0.  */
+}
+#endif /* __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST */
diff --git a/string/test/memcmp.c b/string/test/memcmp.c
index dd93698..7a7cf9c 100644
--- a/string/test/memcmp.c
+++ b/string/test/memcmp.c
@@ -1,7 +1,7 @@
 /*
  * memcmp test.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/string/test/memcpy.c b/string/test/memcpy.c
index 346d920..ce0ceee 100644
--- a/string/test/memcpy.c
+++ b/string/test/memcpy.c
@@ -1,7 +1,7 @@
 /*
  * memcpy test.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/string/test/memmove.c b/string/test/memmove.c
index af92fe3..689b68c 100644
--- a/string/test/memmove.c
+++ b/string/test/memmove.c
@@ -1,7 +1,7 @@
 /*
  * memmove test.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/string/test/memset.c b/string/test/memset.c
index cebe9ad..f172144 100644
--- a/string/test/memset.c
+++ b/string/test/memset.c
@@ -1,7 +1,7 @@
 /*
  * memset test.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/string/test/strcmp.c b/string/test/strcmp.c
index 4e718e3..d57b54e 100644
--- a/string/test/strcmp.c
+++ b/string/test/strcmp.c
@@ -1,7 +1,7 @@
 /*
  * strcmp test.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/string/test/strncmp.c b/string/test/strncmp.c
index 23fbb0a..018a8a4 100644
--- a/string/test/strncmp.c
+++ b/string/test/strncmp.c
@@ -1,7 +1,7 @@
 /*
  * strncmp test.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/string/test/strrchr.c b/string/test/strrchr.c
index b968457..fedbdc5 100644
--- a/string/test/strrchr.c
+++ b/string/test/strrchr.c
@@ -1,7 +1,7 @@
 /*
  * strrchr test.
  *
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -91,7 +91,7 @@ test (const struct fun *fun, int align, int seekpos, int len)
   if (p != s + len)
     {
       ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
-	   fun->name, s, 0, len, p, f, len);
+	   fun->name, s, 0, len, p, s + len, len);
       quote ("input", s, len);
     }
 }
-- 
Gitee