diff --git a/Makefile b/Makefile
index 89fc13bd495c48a742d6fe6d0202d945d38a83b5..169f89e2c9d6be3f53a91780447652ee7917b28e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 # Makefile - requires GNU make
 #
-# Copyright (c) 2018-2019, Arm Limited.
+# Copyright (c) 2018-2020, Arm Limited.
 # SPDX-License-Identifier: MIT
 
 srcdir = .
diff --git a/README b/README
index 6cde5cdae42af60be9410a754f850f8108290862..9e1a34fdc65d9acd27964255a42211af5ef06efa 100644
--- a/README
+++ b/README
@@ -9,7 +9,7 @@ contributor-agreement.pdf. This is needed so upstreaming code
 to projects that require copyright assignment is possible.
 
 Regular quarterly releases are tagged as vYY.MM, the latest
-release is v20.05.
+release is v21.02.
 
 Source code layout:
 
diff --git a/README.OpenSource b/README.OpenSource
index 45a839b409ca5a6ce7f8ff803683173a154cfcad..0e874ba9c123e151ae6afc3d918443d4986802fc 100644
--- a/README.OpenSource
+++ b/README.OpenSource
@@ -3,9 +3,9 @@
         "Name"                  : "optimized-routines",
         "License"               : "MIT License",
         "License File"          : "LICENSE",
-        "Version Number"        : "v20.05",
+        "Version Number"        : "v21.02",
         "Owner"                 : "zhaotianyu9@huawei.com",
-        "Upstream URL"          : "https://www.mirbsd./mksh.ht://www.arm.com/;https://github.com/ARM-software/optimized-routines",
+        "Upstream URL"          : "https://github.com/ARM-software/optimized-routines",
         "Description"           : "Optimized implementations of various library functions for ARM architecture processors"
     }
 ]
diff --git a/config.mk.dist b/config.mk.dist
index 3e55c988bc8f95e693e360450b0b3b4552337939..177e1ac4f53a3e14772a7560f7f79eba86ffe5e7 100644
--- a/config.mk.dist
+++ b/config.mk.dist
@@ -1,6 +1,6 @@
 # Example config.mk
 #
-# Copyright (c) 2018-2019, Arm Limited.
+# Copyright (c) 2018-2020, Arm Limited.
 # SPDX-License-Identifier: MIT
 
 # Subprojects to build
diff --git a/math/cosf.c b/math/cosf.c
index 831b39e85e766b2851135d63bbd5a858084b7b87..f29f19474e230327f439da21eb0661e53bfaa1fe 100644
--- a/math/cosf.c
+++ b/math/cosf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision cos function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/erf.c b/math/erf.c
new file mode 100644
index 0000000000000000000000000000000000000000..12d7e5160df702ab10ff1ae5da5604c927e54372
--- /dev/null
+++ b/math/erf.c
@@ -0,0 +1,244 @@
+/*
+ * Double-precision erf(x) function.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "math_config.h"
+#include <math.h>
+#include <stdint.h>
+
+#define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3 /* 2/sqrt(pi) - 1.  */
+#define C 0x1.b0ac16p-1 /* Added to P / Q on 0.84375 <= |x| < 1.25.  */
+#define PA __erf_data.erf_poly_A /* 2^(-28) <= |x| < 0.5.  */
+#define NA __erf_data.erf_ratio_N_A /* 0.5 <= |x| < 0.84375, numerator.  */
+#define DA __erf_data.erf_ratio_D_A /* 0.5 <= |x| < 0.84375, denominator.  */
+#define NB __erf_data.erf_ratio_N_B /* 0.84375 <= |x| < 1.25, numerator.  */
+#define DB __erf_data.erf_ratio_D_B /* 0.84375 <= |x| < 1.25, denominator.  */
+#define PC __erf_data.erfc_poly_C /* 1.25 <= |x| < 2.  */
+#define PD __erf_data.erfc_poly_D /* 2 <= |x| < 3.25.  */
+#define PE __erf_data.erfc_poly_E /* 3.25 <= |x| < 4.  */
+#define PF __erf_data.erfc_poly_F /* 4 <= |x| < 5.90625.  */
+
+/* Return the top 32 bits of asuint64 (x), truncated to uint32_t.  */
+static inline uint32_t
+top32 (double x)
+{
+  return asuint64 (x) >> 32;
+}
+
+/* Fast erf implementation: picks an interval from |x| and evaluates
+   a rational or polynomial approximation on it.
+   Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0.  */
+double
+erf (double x)
+{
+  /* Top 32 bits of x; ia has the sign bit cleared, sign is that bit.  */
+  uint32_t ix = top32 (x);
+  uint32_t ia = ix & 0x7fffffff;
+  uint32_t sign = ix >> 31;
+
+  /* Normalized and subnormal cases.  */
+  if (ia < 0x3feb0000)
+    { /* a = |x| < 0.84375.  */
+
+      if (ia < 0x3e300000)
+	{ /* a < 2^(-28): return x * 2/sqrt(pi).  */
+	  if (ia < 0x00800000)
+	    { /* a < 2^(-1015): hand the result to check_uflow.  */
+	      double y =  fma (TwoOverSqrtPiMinusOne, x, x);
+	      return check_uflow (y);
+	    }
+	  return x + TwoOverSqrtPiMinusOne * x;
+	}
+
+      double x2 = x * x;
+
+      if (ia < 0x3fe00000)
+	{ /* a < 0.5  - Use polynomial approximation.  */
+	  double r1 = fma (x2, PA[1], PA[0]);
+	  double r2 = fma (x2, PA[3], PA[2]);
+	  double r3 = fma (x2, PA[5], PA[4]);
+	  double r4 = fma (x2, PA[7], PA[6]);
+	  double r5 = fma (x2, PA[9], PA[8]);
+	  double x4 = x2 * x2;
+	  double r = r5;
+	  r = fma (x4, r, r4);
+	  r = fma (x4, r, r3);
+	  r = fma (x4, r, r2);
+	  r = fma (x4, r, r1);
+	  return fma (r, x, x); /* This fma is crucial for accuracy.  */
+	}
+      else
+	{ /* 0.5 <= a < 0.84375 - Use rational approximation.  */
+	  double x4, x8, r1n, r2n, r1d, r2d, r3d;
+
+	  r1n = fma (x2, NA[1], NA[0]);
+	  x4 = x2 * x2;
+	  r2n = fma (x2, NA[3], NA[2]);
+	  x8 = x4 * x4;
+	  r1d = fma (x2, DA[0], 1.0); /* Denominator constant term is 1.0.  */
+	  r2d = fma (x2, DA[2], DA[1]);
+	  r3d = fma (x2, DA[4], DA[3]);
+	  double P = r1n + x4 * r2n + x8 * NA[4];
+	  double Q = r1d + x4 * r2d + x8 * r3d;
+	  return fma (P / Q, x, x);
+	}
+    }
+  else if (ia < 0x3ff40000)
+    { /* 0.84375 <= |x| < 1.25.  */
+      double a2, a4, a6, r1n, r2n, r3n, r4n, r1d, r2d, r3d, r4d;
+      double a = fabs (x) - 1.0;
+      r1n = fma (a, NB[1], NB[0]);
+      a2 = a * a;
+      r1d = fma (a, DB[0], 1.0); /* Denominator constant term is 1.0.  */
+      a4 = a2 * a2;
+      r2n = fma (a, NB[3], NB[2]);
+      a6 = a4 * a2;
+      r2d = fma (a, DB[2], DB[1]);
+      r3n = fma (a, NB[5], NB[4]);
+      r3d = fma (a, DB[4], DB[3]);
+      r4n = NB[6];
+      r4d = DB[5];
+      double P = r1n + a2 * r2n + a4 * r3n + a6 * r4n;
+      double Q = r1d + a2 * r2d + a4 * r3d + a6 * r4d;
+      if (sign) /* Negative x: mirror the positive-side result.  */
+	return -C - P / Q;
+      else
+	return C + P / Q;
+    }
+  else if (ia < 0x40000000)
+    { /* 1.25 <= |x| < 2.0.  */
+      double a = fabs (x);
+      a = a - 1.25;
+
+      double r1 = fma (a, PC[1], PC[0]);
+      double r2 = fma (a, PC[3], PC[2]);
+      double r3 = fma (a, PC[5], PC[4]);
+      double r4 = fma (a, PC[7], PC[6]);
+      double r5 = fma (a, PC[9], PC[8]);
+      double r6 = fma (a, PC[11], PC[10]);
+      double r7 = fma (a, PC[13], PC[12]);
+      double r8 = fma (a, PC[15], PC[14]);
+
+      double a2 = a * a;
+
+      double r = r8;
+      r = fma (a2, r, r7);
+      r = fma (a2, r, r6);
+      r = fma (a2, r, r5);
+      r = fma (a2, r, r4);
+      r = fma (a2, r, r3);
+      r = fma (a2, r, r2);
+      r = fma (a2, r, r1);
+
+      if (sign) /* Negative x: return -(1.0 - r).  */
+	return -1.0 + r;
+      else
+	return 1.0 - r;
+    }
+  else if (ia < 0x400a0000)
+    { /* 2 <= |x| < 3.25.  */
+      double a = fabs (x);
+      a = fma (0.5, a, -1.0);
+
+      double r1 = fma (a, PD[1], PD[0]);
+      double r2 = fma (a, PD[3], PD[2]);
+      double r3 = fma (a, PD[5], PD[4]);
+      double r4 = fma (a, PD[7], PD[6]);
+      double r5 = fma (a, PD[9], PD[8]);
+      double r6 = fma (a, PD[11], PD[10]);
+      double r7 = fma (a, PD[13], PD[12]);
+      double r8 = fma (a, PD[15], PD[14]);
+      double r9 = fma (a, PD[17], PD[16]);
+
+      double a2 = a * a;
+
+      double r = r9;
+      r = fma (a2, r, r8);
+      r = fma (a2, r, r7);
+      r = fma (a2, r, r6);
+      r = fma (a2, r, r5);
+      r = fma (a2, r, r4);
+      r = fma (a2, r, r3);
+      r = fma (a2, r, r2);
+      r = fma (a2, r, r1);
+
+      if (sign)
+	return -1.0 + r;
+      else
+	return 1.0 - r;
+    }
+  else if (ia < 0x40100000)
+    { /* 3.25 <= |x| < 4.0.  */
+      double a = fabs (x);
+      a = a - 3.25;
+
+      double r1 = fma (a, PE[1], PE[0]);
+      double r2 = fma (a, PE[3], PE[2]);
+      double r3 = fma (a, PE[5], PE[4]);
+      double r4 = fma (a, PE[7], PE[6]);
+      double r5 = fma (a, PE[9], PE[8]);
+      double r6 = fma (a, PE[11], PE[10]);
+      double r7 = fma (a, PE[13], PE[12]);
+
+      double a2 = a * a;
+
+      double r = r7;
+      r = fma (a2, r, r6);
+      r = fma (a2, r, r5);
+      r = fma (a2, r, r4);
+      r = fma (a2, r, r3);
+      r = fma (a2, r, r2);
+      r = fma (a2, r, r1);
+
+      if (sign)
+	return -1.0 + r;
+      else
+	return 1.0 - r;
+    }
+  else if (ia < 0x4017a000)
+    { /* 4 <= |x| < 5.90625.  */
+      double a = fabs (x);
+      a = fma (0.5, a, -2.0);
+
+      double r1 = fma (a, PF[1], PF[0]);
+      double r2 = fma (a, PF[3], PF[2]);
+      double r3 = fma (a, PF[5], PF[4]);
+      double r4 = fma (a, PF[7], PF[6]);
+      double r5 = fma (a, PF[9], PF[8]);
+      double r6 = fma (a, PF[11], PF[10]);
+      double r7 = fma (a, PF[13], PF[12]);
+      double r8 = fma (a, PF[15], PF[14]);
+      double r9 = PF[16];
+
+      double a2 = a * a;
+
+      double r = r9;
+      r = fma (a2, r, r8);
+      r = fma (a2, r, r7);
+      r = fma (a2, r, r6);
+      r = fma (a2, r, r5);
+      r = fma (a2, r, r4);
+      r = fma (a2, r, r3);
+      r = fma (a2, r, r2);
+      r = fma (a2, r, r1);
+
+      if (sign)
+	return -1.0 + r;
+      else
+	return 1.0 - r;
+    }
+  else /* |x| >= 5.90625, infinity or nan.  */
+    {
+      /* Special cases : erf(nan)=nan, erf(+inf)=+1 and erf(-inf)=-1.  */
+      if (unlikely (ia >= 0x7ff00000))
+	return (double) (1.0 - (sign << 1)) + 1.0 / x;
+
+      if (sign) /* Finite |x| >= 5.90625: the result is -1.0 or 1.0.  */
+	return -1.0;
+      else
+	return 1.0;
+    }
+}
diff --git a/math/erf_data.c b/math/erf_data.c
new file mode 100644
index 0000000000000000000000000000000000000000..807875bdd7f5db86ad3557c9c36c7afd93c07ca0
--- /dev/null
+++ b/math/erf_data.c
@@ -0,0 +1,85 @@
+/*
+ * Shared data between erf and erfc.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "math_config.h"
+
+/*
+Minimax approximation of erf
+*/
+const struct erf_data __erf_data = {
+.erf_poly_A = { /* erf.c uses this for 2^(-28) <= |x| < 0.5.  */
+#if ERF_POLY_A_NCOEFFS == 10
+0x1.06eba8214db68p-3, -0x1.812746b037948p-2, 0x1.ce2f21a03872p-4,
+-0x1.b82ce30e6548p-6, 0x1.565bcc360a2f2p-8, -0x1.c02d812bc979ap-11,
+0x1.f99bddfc1ebe9p-14, -0x1.f42c457cee912p-17, 0x1.b0e414ec20ee9p-20,
+-0x1.18c47fd143c5ep-23
+#endif
+},
+/* Rational approximation on [0x1p-28, 0.84375]; erf.c uses it from 0.5.  */
+.erf_ratio_N_A = {
+0x1.06eba8214db68p-3, -0x1.4cd7d691cb913p-2, -0x1.d2a51dbd7194fp-6,
+-0x1.7a291236668e4p-8, -0x1.8ead6120016acp-16
+},
+.erf_ratio_D_A = { /* erf.c supplies the constant term 1.0 itself.  */
+0x1.97779cddadc09p-2, 0x1.0a54c5536cebap-4, 0x1.4d022c4d36b0fp-8,
+0x1.15dc9221c1a1p-13, -0x1.09c4342a2612p-18
+},
+/* Rational approximation on [0.84375, 1.25], evaluated at |x| - 1.0.  */
+.erf_ratio_N_B = {
+-0x1.359b8bef77538p-9, 0x1.a8d00ad92b34dp-2, -0x1.7d240fbb8c3f1p-2,
+0x1.45fca805120e4p-2, -0x1.c63983d3e28ecp-4, 0x1.22a36599795ebp-5,
+-0x1.1bf380a96073fp-9
+},
+.erf_ratio_D_B = { /* erf.c supplies the constant term 1.0 itself.  */
+0x1.b3e6618eee323p-4, 0x1.14af092eb6f33p-1, 0x1.2635cd99fe9a7p-4,
+0x1.02660e763351fp-3, 0x1.bedc26b51dd1cp-7, 0x1.88b545735151dp-7
+},
+.erfc_poly_C = { /* erf.c evaluates this at |x| - 1.25.  */
+#if ERFC_POLY_C_NCOEFFS == 16
+/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=15 a=1.25 b=2 c=1 d=1.25 */
+0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2,
+-0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5,
+-0x1.5cffd86b4de16p-6, -0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8,
+-0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12,
+0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16,
+-0x1.578c9e375d37p-19
+#endif
+},
+.erfc_poly_D = { /* erf.c evaluates this at 0.5 * |x| - 1.0.  */
+#if ERFC_POLY_D_NCOEFFS == 18
+/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2 */
+0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3,
+-0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, -0x1.ace732477e494p-2,
+-0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2,
+0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3,
+-0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3,
+0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10
+#endif
+},
+.erfc_poly_E = { /* erf.c evaluates this at |x| - 3.25.  */
+#if ERFC_POLY_E_NCOEFFS == 14
+/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25 */
+0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14,
+-0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12,
+0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14,
+-0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20,
+-0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23
+#endif
+},
+.erfc_poly_F = { /* erf.c evaluates this at 0.5 * |x| - 2.0.  */
+#if ERFC_POLY_F_NCOEFFS == 17
+/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4 */
+0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19,
+-0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14,
+0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11,
+-0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11,
+0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14,
+-0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19
+#endif
+}
+};
+
diff --git a/math/erff.c b/math/erff.c
new file mode 100644
index 0000000000000000000000000000000000000000..a58e82565dc34745500197c469d7f2ea9ec1f71b
--- /dev/null
+++ b/math/erff.c
@@ -0,0 +1,104 @@
+/*
+ * Single-precision erf(x) function.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdint.h>
+#include <math.h>
+#include "math_config.h"
+
+#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f /* 2/sqrt(pi) - 1.  */
+#define A __erff_data.erff_poly_A /* 2^(-28) <= |x| < 0.875.  */
+#define B __erff_data.erff_poly_B /* 0.875 <= |x| < 4.0.  */
+
+/* Return the top 12 bits of asuint (x), i.e. asuint (x) >> 20.  */
+static inline uint32_t
+top12 (float x)
+{
+  return asuint (x) >> 20;
+}
+
+/* Efficient implementation of erff
+   using either a pure polynomial approximation or
+   the exponential of a polynomial (1 - expf (-poly)).
+   Worst-case error is 1.09ulps at 0x1.c111acp-1.  */
+float
+erff (float x)
+{
+  float r, x2, u;
+
+  /* Raw bits, sign bit, and top 12 bits with the sign masked off.  */
+  uint32_t ix = asuint (x);
+  uint32_t sign = ix >> 31;
+  uint32_t ia12 = top12 (x) & 0x7ff;
+
+  /* Limit of both intervals is 0.875 for performance reasons but the
+     coefficients were computed on [0.0, 0.921875] and [0.921875, 4.0],
+     which brought accuracy from 0.94 to 1.1ulps.  */
+  if (ia12 < 0x3f6)
+    { /* a = |x| < 0.875.  */
+
+      /* Tiny and subnormal cases.  */
+      if (unlikely (ia12 < 0x318))
+	{ /* |x| < 2^(-28): return x * 2/sqrt(pi).  */
+	  if (unlikely (ia12 < 0x040))
+	    { /* |x| < 2^(-119): hand the result to check_uflowf.  */
+	      float y = fmaf (TwoOverSqrtPiMinusOne, x, x);
+	      return check_uflowf (y);
+	    }
+	  return x + TwoOverSqrtPiMinusOne * x;
+	}
+
+      x2 = x * x;
+
+      /* 2^(-28) <= |x| < 0.875 (fit on [0, 0.921875]): Horner for x+x*P(x^2).  */
+      r = A[5];
+      r = fmaf (r, x2, A[4]);
+      r = fmaf (r, x2, A[3]);
+      r = fmaf (r, x2, A[2]);
+      r = fmaf (r, x2, A[1]);
+      r = fmaf (r, x2, A[0]);
+      r = fmaf (r, x, x);
+    }
+  else if (ia12 < 0x408)
+    { /* 0.875 <= |x| < 4.0 - Use a custom Estrin scheme.  */
+
+      float a = fabsf (x);
+      /* Start with Estrin scheme on high order (small magnitude) coefficients.  */
+      r = fmaf (B[6], a, B[5]);
+      u = fmaf (B[4], a, B[3]);
+      x2 = x * x;
+      r = fmaf (r, x2, u);
+      /* Then switch to pure Horner scheme.  */
+      r = fmaf (r, a, B[2]);
+      r = fmaf (r, a, B[1]);
+      r = fmaf (r, a, B[0]);
+      r = fmaf (r, a, a);
+      /* Single precision exponential with ~0.5ulps,
+	 ensures erff has max. rel. error
+	 < 1ulp on [0.921875, 4.0],
+	 < 1.1ulps on [0.875, 4.0].  */
+      r = expf (-r);
+      /* Explicit copysign (calling copysignf increases latency).  */
+      if (sign)
+	r = -1.0f + r;
+      else
+	r = 1.0f - r;
+    }
+  else
+    { /* |x| >= 4.0, infinity or nan.  */
+
+      /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1.  */
+      if (unlikely (ia12 >= 0x7f8))
+	return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x;
+
+      /* Explicit copysign (calling copysignf increases latency).  */
+      if (sign)
+	r = -1.0f;
+      else
+	r = 1.0f;
+    }
+  return r;
+}
diff --git a/math/erff_data.c b/math/erff_data.c
new file mode 100644
index 0000000000000000000000000000000000000000..fa6b1ef4dedbfe7bafe493aa7c0dc007174fe704
--- /dev/null
+++ b/math/erff_data.c
@@ -0,0 +1,22 @@
+/*
+ * Data for approximation of erff.
+ *
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "math_config.h"
+
+/* Minimax approximation of erff. */
+const struct erff_data __erff_data = {
+.erff_poly_A = { /* erff.c uses this for 2^(-28) <= |x| < 0.875.  */
+0x1.06eba6p-03f, -0x1.8126e0p-02f, 0x1.ce1a46p-04f,
+-0x1.b68bd2p-06f, 0x1.473f48p-08f, -0x1.3a1a82p-11f
+},
+.erff_poly_B = { /* erff.c uses this for 0.875 <= |x| < 4.0, inside expf.  */
+0x1.079d0cp-3f, 0x1.450aa0p-1f, 0x1.b55cb0p-4f,
+-0x1.8d6300p-6f, 0x1.fd1336p-9f, -0x1.91d2ccp-12f,
+0x1.222900p-16f
+}
+};
+
diff --git a/math/exp.c b/math/exp.c
index 1909b8ea435b830feac8202c565cdb007b942682..7f5024cd8792144fe2681f1a60e297d405b9ea06 100644
--- a/math/exp.c
+++ b/math/exp.c
@@ -1,7 +1,7 @@
 /*
  * Double-precision e^x function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/exp2.c b/math/exp2.c
index 47aa47902eca6b5ed2cc0e346ff6d4b9aab917d4..35ab39f22ed5fcb0442c2fb84eea80ff95540fe2 100644
--- a/math/exp2.c
+++ b/math/exp2.c
@@ -1,7 +1,7 @@
 /*
  * Double-precision 2^x function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/expf.c b/math/expf.c
index 0fe1f7d20efd58d44e5acfec2db0a466bd8ce42b..9b2f0c3d8c56c98d8e9d37d45143b713cb92e570 100644
--- a/math/expf.c
+++ b/math/expf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision e^x function.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/include/mathlib.h b/math/include/mathlib.h
index 449300839b67ec703ae508a81e09c79dc1e6da2a..279d829d8ea15acae38ae51ada3fa74f3920f7f5 100644
--- a/math/include/mathlib.h
+++ b/math/include/mathlib.h
@@ -1,7 +1,7 @@
 /*
  * Public API.
  *
- * Copyright (c) 2015-2019, Arm Limited.
+ * Copyright (c) 2015-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/log.c b/math/log.c
index b85d3ff52630c5da1366067b904bd7a296389961..d3b7bc60747c2ace661ed1885669b1ab763e4dd2 100644
--- a/math/log.c
+++ b/math/log.c
@@ -1,7 +1,7 @@
 /*
  * Double-precision log(x) function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/log2.c b/math/log2.c
index 804fb85d3ea6d8a1d7f0881556282c5d61037e9a..55102b7729696324f1f2afb4cf4cd89fbd06c034 100644
--- a/math/log2.c
+++ b/math/log2.c
@@ -1,7 +1,7 @@
 /*
  * Double-precision log2(x) function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/logf.c b/math/logf.c
index ee3120a3749bd40c757f189f846dadcbdc195931..cfbaee12df108750f6de0ca9f8dd30be7a17ff2b 100644
--- a/math/logf.c
+++ b/math/logf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision log function.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/logf_data.c b/math/logf_data.c
index 53c5f624f7bbf5302180fd33337756acffc564b5..e8973ce4fedcbffc2d587bf73fd2afa3917331ca 100644
--- a/math/logf_data.c
+++ b/math/logf_data.c
@@ -1,7 +1,7 @@
 /*
  * Data definition for logf.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/math_config.h b/math/math_config.h
index 85fc58478f3a4f13ebfc31997a65505bfdc71d5a..e85104337048abdfb1f51302fe7b3d33ead2b06a 100644
--- a/math/math_config.h
+++ b/math/math_config.h
@@ -1,7 +1,7 @@
 /*
  * Configuration for math routines.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -298,6 +298,24 @@ check_uflow (double x)
   return WANT_ERRNO ? __math_check_uflow (x) : x;
 }
 
+/* Check if the result overflowed to infinity (defined in math_errf.c).  */
+HIDDEN float __math_check_oflowf (float);
+/* Check if the result underflowed to 0 (defined in math_errf.c).  */
+HIDDEN float __math_check_uflowf (float);
+
+/* Overflow check: returns x unchanged unless WANT_ERRNO is set.  */
+static inline float
+check_oflowf (float x)
+{
+  return WANT_ERRNO ? __math_check_oflowf (x) : x;
+}
+
+/* Underflow check: returns x unchanged unless WANT_ERRNO is set.  */
+static inline float
+check_uflowf (float x)
+{
+  return WANT_ERRNO ? __math_check_uflowf (x) : x;
+}
 
 /* Shared between expf, exp2f and powf.  */
 #define EXP2F_TABLE_BITS 5
@@ -416,4 +434,29 @@ extern const struct pow_log_data
   struct {double invc, pad, logc, logctail;} tab[1 << POW_LOG_TABLE_BITS];
 } __pow_log_data HIDDEN;
 
+extern const struct erff_data
+{
+  float erff_poly_A[6]; /* erff.c reads A[0]..A[5].  */
+  float erff_poly_B[7]; /* erff.c reads B[0]..B[6].  */
+} __erff_data HIDDEN;
+
+#define ERF_POLY_A_ORDER 19
+#define ERF_POLY_A_NCOEFFS 10 /* erf_data.c has data only for 10.  */
+#define ERFC_POLY_C_NCOEFFS 16 /* erf_data.c has data only for 16.  */
+#define ERFC_POLY_D_NCOEFFS 18 /* erf_data.c has data only for 18.  */
+#define ERFC_POLY_E_NCOEFFS 14 /* erf_data.c has data only for 14.  */
+#define ERFC_POLY_F_NCOEFFS 17 /* erf_data.c has data only for 17.  */
+extern const struct erf_data
+{
+  double erf_poly_A[ERF_POLY_A_NCOEFFS];
+  double erf_ratio_N_A[5]; /* Numerator, 0.5 <= |x| < 0.84375.  */
+  double erf_ratio_D_A[5]; /* Denominator, 0.5 <= |x| < 0.84375.  */
+  double erf_ratio_N_B[7]; /* Numerator, 0.84375 <= |x| < 1.25.  */
+  double erf_ratio_D_B[6]; /* Denominator, 0.84375 <= |x| < 1.25.  */
+  double erfc_poly_C[ERFC_POLY_C_NCOEFFS];
+  double erfc_poly_D[ERFC_POLY_D_NCOEFFS];
+  double erfc_poly_E[ERFC_POLY_E_NCOEFFS];
+  double erfc_poly_F[ERFC_POLY_F_NCOEFFS];
+} __erf_data HIDDEN;
+
 #endif
diff --git a/math/math_errf.c b/math/math_errf.c
index 07154c57d4fb8341840997c39ee0e2f943de3b77..d5350b819ab1aa4c37f61616e2d54b77027520fd 100644
--- a/math/math_errf.c
+++ b/math/math_errf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision math error handling.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -64,3 +64,17 @@ __math_invalidf (float x)
   float y = (x - x) / (x - x);
   return isnan (x) ? y : with_errnof (y, EDOM);
 }
+
+/* Check result and set errno if necessary.  */
+
+HIDDEN float
+__math_check_uflowf (float y)
+{
+  return y == 0.0f ? with_errnof (y, ERANGE) : y; /* Matches -0.0f too.  */
+}
+
+HIDDEN float
+__math_check_oflowf (float y)
+{
+  return isinf (y) ? with_errnof (y, ERANGE) : y; /* Either infinity.  */
+}
diff --git a/math/pow.c b/math/pow.c
index ced7c4fe6f675622cea5af4ac628277426fafc09..86842c6abacd962b4df3f536229c977b9d167775 100644
--- a/math/pow.c
+++ b/math/pow.c
@@ -1,7 +1,7 @@
 /*
  * Double-precision x^y function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/powf.c b/math/powf.c
index 1534a09b81cb1ec508efe818f3ee0abdf762f361..6ba45d3852a50b1ae3decb93e291a6d285692e5e 100644
--- a/math/powf.c
+++ b/math/powf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision pow function.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/powf_log2_data.c b/math/powf_log2_data.c
index b9fbdc4de003b0fada415bc70cb394013afbb4b7..97e0d98cdbab6ffa9358a9670acd5c1255c02799 100644
--- a/math/powf_log2_data.c
+++ b/math/powf_log2_data.c
@@ -1,7 +1,7 @@
 /*
  * Data definition for powf.
  *
- * Copyright (c) 2017-2018, Arm Limited.
+ * Copyright (c) 2017-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/sincosf.c b/math/sincosf.c
index e6cd41ea3f1375f9b7a629882af733c894d8cba2..9746f1c22e6c2b30a2003e649fcfd40ebd8bcc7c 100644
--- a/math/sincosf.c
+++ b/math/sincosf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision sin/cos function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/sincosf_data.c b/math/sincosf_data.c
index 5d0b58ece38c203a0cfb653b48230f2955c0a6dc..ab4ac4710feff2468cf9e55b04d4ad22dbc75233 100644
--- a/math/sincosf_data.c
+++ b/math/sincosf_data.c
@@ -1,7 +1,7 @@
 /*
  * Data definition for sinf, cosf and sincosf.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/sinf.c b/math/sinf.c
index 770b294c2379110a5719847badbb682935a754dd..ddbc1daf74a9df1d90dad824f3c30d0460aafcc2 100644
--- a/math/sinf.c
+++ b/math/sinf.c
@@ -1,7 +1,7 @@
 /*
  * Single-precision sin function.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/mathbench.c b/math/test/mathbench.c
index 33ceda3837136607a7aaa31678b9b9d20c073683..0c17826e52961b3abd86b1e53ab3ec4a74d7ed8e 100644
--- a/math/test/mathbench.c
+++ b/math/test/mathbench.c
@@ -1,7 +1,7 @@
 /*
  * Microbenchmark for math functions.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -248,6 +248,7 @@ D (log2, 0.999, 1.001)
 {"pow", 'd', 0, 0.01, 11.1, {.d = xypow}},
 D (xpow, 0.01, 11.1)
 D (ypow, -9.9, 9.9)
+D (erf, -6.0, 6.0)
 
 F (dummyf, 1.0, 2.0)
 F (expf, -9.9, 9.9)
@@ -275,6 +276,7 @@ F (cosf, -3.1, 3.1)
 F (cosf, 3.3, 33.3)
 F (cosf, 100, 1000)
 F (cosf, 1e6, 1e32)
+F (erff, -4.0, 4.0)
 #if WANT_VMATH
 D (__s_sin, -3.1, 3.1)
 D (__s_cos, -3.1, 3.1)
diff --git a/math/test/mathtest.c b/math/test/mathtest.c
index 2ff8c3f4e5cc7544b2e20e3b49ca3235c83532ec..310896738e478481a9f91ff878957a1f86accc2e 100644
--- a/math/test/mathtest.c
+++ b/math/test/mathtest.c
@@ -1,7 +1,7 @@
 /*
  * mathtest.c - test rig for mathlib
  *
- * Copyright (c) 1998-2018, Arm Limited.
+ * Copyright (c) 1998-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/dotest.c b/math/test/rtest/dotest.c
index f416477909b89f53ba4ed0b2e556bd0d440015ac..6be79e1df0d1acef5a5c3861f1ab73058e10b836 100644
--- a/math/test/rtest/dotest.c
+++ b/math/test/rtest/dotest.c
@@ -1,7 +1,7 @@
 /*
  * dotest.c - actually generate mathlib test cases
  *
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/intern.h b/math/test/rtest/intern.h
index af574b0ff90f433a5f541c5bdac7cd8e84914c44..12a9c749e18e1127eb27922ad30d13ac3cbd4d1c 100644
--- a/math/test/rtest/intern.h
+++ b/math/test/rtest/intern.h
@@ -1,7 +1,7 @@
 /*
  * intern.h
  *
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/main.c b/math/test/rtest/main.c
index e94e45585aee4526b1a75928173aec70fb3ad913..0d8ead891320a5c5afb3d72d7b7fbd82c5f6e540 100644
--- a/math/test/rtest/main.c
+++ b/math/test/rtest/main.c
@@ -1,7 +1,7 @@
 /*
  * main.c
  *
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/random.c b/math/test/rtest/random.c
index e97a8c648e78d03e690b8886639a057b1f7fa63a..56123966b8c48f8acbeb1501d1e56d5b1d5e3e2c 100644
--- a/math/test/rtest/random.c
+++ b/math/test/rtest/random.c
@@ -1,7 +1,7 @@
 /*
  * random.c - random number generator for producing mathlib test cases
  *
- * Copyright (c) 1998-2018, Arm Limited.
+ * Copyright (c) 1998-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/random.h b/math/test/rtest/random.h
index c1ce956b0f1c75d663b27d4a0fc8e992df5e8e36..b4b22df82a3d768bdb8227f6731b0bce5d6ce843 100644
--- a/math/test/rtest/random.h
+++ b/math/test/rtest/random.h
@@ -1,7 +1,7 @@
 /*
  * random.h - header for random.c
  *
- * Copyright (c) 2009-2018, Arm Limited.
+ * Copyright (c) 2009-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/semi.c b/math/test/rtest/semi.c
index 938dc3aebdf6ac768e6e7cccb2a51b041e2f2165..c9f0daf76508194f5443bfe9fccbbfb89fe8964d 100644
--- a/math/test/rtest/semi.c
+++ b/math/test/rtest/semi.c
@@ -1,7 +1,7 @@
 /*
  * semi.c: test implementations of mathlib seminumerical functions
  *
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/semi.h b/math/test/rtest/semi.h
index da473a2b70fe983271cebd1793776c108229920d..17dc4158fb51e87e465c76ca5c9192cfb0dee71b 100644
--- a/math/test/rtest/semi.h
+++ b/math/test/rtest/semi.h
@@ -1,7 +1,7 @@
 /*
  * semi.h: header for semi.c
  *
- * Copyright (c) 1999-2018, Arm Limited.
+ * Copyright (c) 1999-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/types.h b/math/test/rtest/types.h
index 1a76c2e50f4d8611b8322a31c126ae64177fc54b..53cd557fa4cf448d6d4f49dbd85cf8c514905d47 100644
--- a/math/test/rtest/types.h
+++ b/math/test/rtest/types.h
@@ -1,7 +1,7 @@
 /*
  * types.h
  *
- * Copyright (c) 2005-2018, Arm Limited.
+ * Copyright (c) 2005-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/wrappers.c b/math/test/rtest/wrappers.c
index acaf67141d114d6fc645d1d0790a458f5171fa29..de45ac5768d0f750c1ad48c15902a02df9a8336d 100644
--- a/math/test/rtest/wrappers.c
+++ b/math/test/rtest/wrappers.c
@@ -1,7 +1,7 @@
 /*
  * wrappers.c - wrappers to modify output of MPFR/MPC test functions
  *
- * Copyright (c) 2014-2018, Arm Limited.
+ * Copyright (c) 2014-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/rtest/wrappers.h b/math/test/rtest/wrappers.h
index 5804935b9246958fe9e369760b0ca59f5c9edd06..7b09c85a59f114af56f6ec7dd9e0e7c00bd43721 100644
--- a/math/test/rtest/wrappers.h
+++ b/math/test/rtest/wrappers.h
@@ -1,7 +1,7 @@
 /*
  * wrappers.h - wrappers to modify output of MPFR/MPC test functions
  *
- * Copyright (c) 2014-2018, Arm Limited.
+ * Copyright (c) 2014-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/math/test/runulp.sh b/math/test/runulp.sh
old mode 100644
new mode 100755
index a8c391b5584f841835b80dd1ed4ae243b76af660..0190d9ab27fb104de780d9101507a85ee9ff7a2e
--- a/math/test/runulp.sh
+++ b/math/test/runulp.sh
@@ -2,7 +2,7 @@
 
 # ULP error check script.
 #
-# Copyright (c) 2019, Arm Limited.
+# Copyright (c) 2019-2020, Arm Limited.
 # SPDX-License-Identifier: MIT
 
 #set -x
@@ -72,6 +72,16 @@ t pow  0x1.ffffffffffff0p-1  0x1.0000000000008p0 x 0x1p60 0x1p68 50000
 t pow  0x1.ffffffffff000p-1  0x1p0 x 0x1p50 0x1p52 50000
 t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000
 
+L=1.0
+Ldir=0.9
+t erf  0 0xffff000000000000 10000
+t erf  0x1p-1022  0x1p-26   40000
+t erf  -0x1p-1022 -0x1p-26  40000
+t erf  0x1p-26    0x1p3     40000
+t erf  -0x1p-26  -0x1p3     40000
+t erf  0         inf        40000
+Ldir=0.5
+
 L=0.01
 t expf  0    0xffff0000    10000
 t expf  0x1p-14   0x1p8    50000
@@ -119,6 +129,17 @@ t powf  0x1p-70 0x1p70  x  0x1p-1 0x1p1   50000
 t powf  0x1p-70 0x1p70  x  -0x1p-1 -0x1p1 50000
 t powf  0x1.ep-1 0x1.1p0 x  0x1p8 0x1p14  50000
 t powf  0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000
+
+L=0.6
+Ldir=0.9
+t erff  0      0xffff0000 10000
+t erff  0x1p-127  0x1p-26 40000
+t erff -0x1p-127 -0x1p-26 40000
+t erff  0x1p-26   0x1p3   40000
+t erff -0x1p-26  -0x1p3   40000
+t erff  0         inf     40000
+Ldir=0.5
+
 done
 
 # vector functions
diff --git a/math/test/testcases/directed/cosf.tst b/math/test/testcases/directed/cosf.tst
index 5dc0994081cde250a6058085363328090764b794..79160443f0990058f70bc0d03be6be545f3fd6f7 100644
--- a/math/test/testcases/directed/cosf.tst
+++ b/math/test/testcases/directed/cosf.tst
@@ -1,6 +1,6 @@
 ; cosf.tst - Directed test cases for SP cosine
 ;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=cosf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/erf.tst b/math/test/testcases/directed/erf.tst
new file mode 100644
index 0000000000000000000000000000000000000000..7fa4d1868c0eb1a27920eda1485ee7be4dbe0f01
--- /dev/null
+++ b/math/test/testcases/directed/erf.tst
@@ -0,0 +1,17 @@
+; erf.tst - Directed test cases for erf
+;
+; Copyright (c) 2007-2020, Arm Limited.
+; SPDX-License-Identifier: MIT
+
+func=erf op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=erf op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=erf op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=erf op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=erf op1=7ff00000.00000000 result=3ff00000.00000000 errno=0
+func=erf op1=fff00000.00000000 result=bff00000.00000000 errno=0
+func=erf op1=00000000.00000000 result=00000000.00000000 errno=ERANGE
+func=erf op1=80000000.00000000 result=80000000.00000000 errno=ERANGE
+func=erf op1=00000000.00000001 result=00000000.00000001 errno=0 status=ux
+func=erf op1=80000000.00000001 result=80000000.00000001 errno=0 status=ux
+func=erf op1=3ff00000.00000000 result=3feaf767.a741088a.c6d errno=0
+func=erf op1=bff00000.00000000 result=bfeaf767.a741088a.c6d errno=0
diff --git a/math/test/testcases/directed/erff.tst b/math/test/testcases/directed/erff.tst
new file mode 100644
index 0000000000000000000000000000000000000000..d05b7b1119c46c21ce7d22d2d3f2cbebff6eae44
--- /dev/null
+++ b/math/test/testcases/directed/erff.tst
@@ -0,0 +1,17 @@
+; erff.tst - Directed test cases for erff
+;
+; Copyright (c) 2007-2020, Arm Limited.
+; SPDX-License-Identifier: MIT
+
+func=erff op1=7fc00001 result=7fc00001 errno=0
+func=erff op1=ffc00001 result=7fc00001 errno=0
+func=erff op1=7f800001 result=7fc00001 errno=0 status=i
+func=erff op1=ff800001 result=7fc00001 errno=0 status=i
+func=erff op1=7f800000 result=3f800000 errno=0
+func=erff op1=ff800000 result=bf800000 errno=0
+func=erff op1=00000000 result=00000000 errno=ERANGE
+func=erff op1=80000000 result=80000000 errno=ERANGE
+func=erff op1=00000001 result=00000001 errno=0 status=ux
+func=erff op1=80000001 result=80000001 errno=0 status=ux
+func=erff op1=3f800000 result=3f57bb3d.3a0 errno=0
+func=erff op1=bf800000 result=bf57bb3d.3a0 errno=0
diff --git a/math/test/testcases/directed/exp.tst b/math/test/testcases/directed/exp.tst
index addfc0ae29f4ff87e8fd4866805835b2e3d92aa0..85d556cd1e00f75c3273e67d420adce2ea7849df 100644
--- a/math/test/testcases/directed/exp.tst
+++ b/math/test/testcases/directed/exp.tst
@@ -1,6 +1,6 @@
 ; Directed test cases for exp
 ;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=exp op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/exp2.tst b/math/test/testcases/directed/exp2.tst
index 04a5a506ea53506e4172ca29cb31a2578f1a3488..fa56c9f8be4b91598121f7f376e68968d806001d 100644
--- a/math/test/testcases/directed/exp2.tst
+++ b/math/test/testcases/directed/exp2.tst
@@ -1,6 +1,6 @@
 ; Directed test cases for exp2
 ;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=exp2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/exp2f.tst b/math/test/testcases/directed/exp2f.tst
index 2b6a9b5b63d34e67790ec94ee504990d16cbe7eb..38cfc3f78ac61dae04c0d0372110d3351e669848 100644
--- a/math/test/testcases/directed/exp2f.tst
+++ b/math/test/testcases/directed/exp2f.tst
@@ -1,6 +1,6 @@
 ; exp2f.tst - Directed test cases for exp2f
 ;
-; Copyright (c) 2017-2018, Arm Limited.
+; Copyright (c) 2017-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=exp2f op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/expf.tst b/math/test/testcases/directed/expf.tst
index 74664c7968b2d67ac0743602fe484e9f094032b4..ff0f671c2656a94b17f96d8f9a683a6b8436f674 100644
--- a/math/test/testcases/directed/expf.tst
+++ b/math/test/testcases/directed/expf.tst
@@ -1,6 +1,6 @@
 ; expf.tst - Directed test cases for expf
 ;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=expf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/log.tst b/math/test/testcases/directed/log.tst
index eeb762cecefe15cffaf11370fe23aa32909e2b82..a0aa398cbf734396be64c61612f463524d283d15 100644
--- a/math/test/testcases/directed/log.tst
+++ b/math/test/testcases/directed/log.tst
@@ -1,6 +1,6 @@
 ; Directed test cases for log
 ;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=log op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/log2.tst b/math/test/testcases/directed/log2.tst
index e0765d8adf1a239b2a2d7e72499c035f024ccfcc..ff1286cbd53e8ebfba5db81b9d244a598eb9a6ac 100644
--- a/math/test/testcases/directed/log2.tst
+++ b/math/test/testcases/directed/log2.tst
@@ -1,6 +1,6 @@
 ; Directed test cases for log2
 ;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/log2f.tst b/math/test/testcases/directed/log2f.tst
index 8d685baba9346322758379ed9cff5b5595a8baf4..5832c4f08f1ecb6acf9fdbbb06f0ce75bac82f6d 100644
--- a/math/test/testcases/directed/log2f.tst
+++ b/math/test/testcases/directed/log2f.tst
@@ -1,6 +1,6 @@
 ; log2f.tst - Directed test cases for log2f
 ;
-; Copyright (c) 2017-2018, Arm Limited.
+; Copyright (c) 2017-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=log2f op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/logf.tst b/math/test/testcases/directed/logf.tst
index 7ccc873aecc286ef9c68f57efc50cc190648d719..6e68a36e0f6a29f8d0646f450ed3abf0aafca260 100644
--- a/math/test/testcases/directed/logf.tst
+++ b/math/test/testcases/directed/logf.tst
@@ -1,6 +1,6 @@
 ; logf.tst - Directed test cases for logf
 ;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=logf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/pow.tst b/math/test/testcases/directed/pow.tst
index a4c42be60bcccb04b93893b2fb4c0661b778c9f7..19665817153d03ef84dc85fe3be375bd63d2dad5 100644
--- a/math/test/testcases/directed/pow.tst
+++ b/math/test/testcases/directed/pow.tst
@@ -1,6 +1,6 @@
 ; Directed test cases for pow
 ;
-; Copyright (c) 2018, Arm Limited.
+; Copyright (c) 2018-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=pow op1=00000000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0
diff --git a/math/test/testcases/directed/powf.tst b/math/test/testcases/directed/powf.tst
index efd1dd5ea357c679bad64a4945743b05cbc55386..3fa8b110f8bcb97196dca92030271ceb376644a8 100644
--- a/math/test/testcases/directed/powf.tst
+++ b/math/test/testcases/directed/powf.tst
@@ -1,6 +1,6 @@
 ; powf.tst - Directed test cases for powf
 ;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 func=powf op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i
diff --git a/math/test/testcases/directed/sincosf.tst b/math/test/testcases/directed/sincosf.tst
index b4b2526062e02f4cd453b7a1c4022f0b5fc478d4..4b33d2291c660c034fed47522966599203bb8b6c 100644
--- a/math/test/testcases/directed/sincosf.tst
+++ b/math/test/testcases/directed/sincosf.tst
@@ -1,6 +1,6 @@
 ; Directed test cases for SP sincos
 ;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 
diff --git a/math/test/testcases/directed/sinf.tst b/math/test/testcases/directed/sinf.tst
index 13cfdca2b5582a1ce1096d42375f649ca35fbb9a..ded80b1598c6a3904ed8eb6baab351f493592bcc 100644
--- a/math/test/testcases/directed/sinf.tst
+++ b/math/test/testcases/directed/sinf.tst
@@ -1,6 +1,6 @@
 ; sinf.tst - Directed test cases for SP sine
 ;
-; Copyright (c) 2007-2018, Arm Limited.
+; Copyright (c) 2007-2019, Arm Limited.
 ; SPDX-License-Identifier: MIT
 
 
diff --git a/math/test/testcases/random/double.tst b/math/test/testcases/random/double.tst
index c37e837014cec768bf0a389c07236e568bd6c89b..c24ff80d5d95eccc799de5bd3dd0876b19ae8fb9 100644
--- a/math/test/testcases/random/double.tst
+++ b/math/test/testcases/random/double.tst
@@ -1,6 +1,6 @@
 !! double.tst - Random test case specification for DP functions
 !!
-!! Copyright (c) 1999-2018, Arm Limited.
+!! Copyright (c) 1999-2019, Arm Limited.
 !! SPDX-License-Identifier: MIT
 
 test exp 10000
diff --git a/math/test/testcases/random/float.tst b/math/test/testcases/random/float.tst
index baf62b9d8c6408dfcd3977ef335603d00c0f24ab..d02a22750abe07b9b64b63a0caf9afcb3c83d50c 100644
--- a/math/test/testcases/random/float.tst
+++ b/math/test/testcases/random/float.tst
@@ -1,6 +1,6 @@
 !! single.tst - Random test case specification for SP functions
 !!
-!! Copyright (c) 1999-2018, Arm Limited.
+!! Copyright (c) 1999-2019, Arm Limited.
 !! SPDX-License-Identifier: MIT
 
 test sinf 10000
diff --git a/math/test/ulp.c b/math/test/ulp.c
index 371567af7de3257c717b4b722d7d0e222fa3b630..51479b87a0fde860e1584536fd13b8471cfca9a2 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -1,7 +1,7 @@
 /*
  * ULP error checking tool for math functions.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -331,11 +331,13 @@ static const struct fun fun[] = {
  F1 (log)
  F1 (log2)
  F2 (pow)
+ F1 (erf)
  D1 (exp)
  D1 (exp2)
  D1 (log)
  D1 (log2)
  D2 (pow)
+ D1 (erf)
 #if WANT_VMATH
  F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0)
  F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0)
diff --git a/math/tools/plot.py b/math/tools/plot.py
old mode 100644
new mode 100755
diff --git a/math/tools/remez.jl b/math/tools/remez.jl
old mode 100644
new mode 100755
index f479fc54d10bb7d295fbacc868c876a052a615bc..2ff436f5287ff2d426413f6817a966ac82990439
--- a/math/tools/remez.jl
+++ b/math/tools/remez.jl
@@ -3,7 +3,7 @@
 
 # remez.jl - implementation of the Remez algorithm for polynomial approximation
 #
-# Copyright (c) 2015-2018, Arm Limited.
+# Copyright (c) 2015-2019, Arm Limited.
 # SPDX-License-Identifier: MIT
 
 import Base.\
diff --git a/math/v_math.h b/math/v_math.h
index 3db22e5b3c122e463f309168a5f659c25a18015e..f2cc4670bb9b8524c0318952b3e0a417a73746b1 100644
--- a/math/v_math.h
+++ b/math/v_math.h
@@ -1,7 +1,7 @@
 /*
  * Vector math abstractions.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/networking/test/chksum.c b/networking/test/chksum.c
index 50722a4f5ff5fc89edaed1f167a50d8f108210f5..41b98120f2758b54b8d13122caffb00224cc3139 100644
--- a/networking/test/chksum.c
+++ b/networking/test/chksum.c
@@ -1,7 +1,7 @@
 /*
  * Ones' complement checksum test & benchmark
  *
- * Copyright 2016-2020 ARM Limited
+ * Copyright (c) 2016-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/string/Dir.mk b/string/Dir.mk
index ae7c673a17355762dc0f911904d578e54110f5be..cf3453f7580d381464b4ebb5eacfe1306a427822 100644
--- a/string/Dir.mk
+++ b/string/Dir.mk
@@ -1,6 +1,6 @@
 # Makefile fragment - requires GNU make
 #
-# Copyright (c) 2019-2020, Arm Limited.
+# Copyright (c) 2019-2021, Arm Limited.
 # SPDX-License-Identifier: MIT
 
 S := $(srcdir)/string
@@ -29,6 +29,8 @@ string-tests := \
 	build/bin/test/memchr \
 	build/bin/test/memrchr \
 	build/bin/test/memcmp \
+	build/bin/test/__mtag_tag_region \
+	build/bin/test/__mtag_tag_zero_region \
 	build/bin/test/strcpy \
 	build/bin/test/stpcpy \
 	build/bin/test/strcmp \
@@ -39,7 +41,9 @@ string-tests := \
 	build/bin/test/strnlen \
 	build/bin/test/strncmp
 
-string-benches := build/bin/bench/memcpy
+string-benches := \
+	build/bin/bench/memcpy \
+	build/bin/bench/strlen
 
 string-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-lib-srcs)))
 string-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-test-srcs)))
@@ -95,6 +99,7 @@ check-string: $(string-tests-out)
 	! grep FAIL $^
 
 bench-string: $(string-benches)
+	$(EMULATOR) build/bin/bench/strlen
 	$(EMULATOR) build/bin/bench/memcpy
 
 install-string: \
diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S
new file mode 100644
index 0000000000000000000000000000000000000000..84339f73cf23770b991c15e62eaba4b186a3201e
--- /dev/null
+++ b/string/aarch64/__mtag_tag_region.S
@@ -0,0 +1,100 @@
+/*
+ * __mtag_tag_region - tag memory
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, MTE, LP64 ABI.
+ *
+ * Interface contract:
+ * Address is 16 byte aligned and size is multiple of 16.
+ * Returns the passed pointer.
+ * The memory region may remain untagged if tagging is not enabled.
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_MEMORY_TAGGING
+
+#define dstin	x0
+#define count	x1
+#define dst	x2
+#define dstend	x3
+#define tmp	x4
+#define zva_val	x4
+
+ENTRY (__mtag_tag_region)	/* x0 = address, x1 = size; x0 is returned unchanged.  */
+	PTR_ARG (0)
+	SIZE_ARG (1)
+
+	add	dstend, dstin, count	/* dstend = one past the end of the region.  */
+
+	cmp	count, 96
+	b.hi	L(set_long)	/* Sizes above 96 take the loop paths.  */
+
+	tbnz	count, 6, L(set96)	/* Bit 6 set with count <= 96: 64..96 bytes.  */
+
+	/* Set 0, 16, 32, or 48 bytes.  */
+	lsr	tmp, count, 5	/* tmp = count / 32, i.e. 0 or 1 for count < 64.  */
+	add	tmp, dstin, tmp, lsl 4	/* tmp = dstin, or dstin + 16 when count >= 32.  */
+	cbz     count, L(end)	/* Size 0: nothing to tag.  */
+	stg	dstin, [dstin]	/* First 16 bytes.  */
+	stg	dstin, [tmp]	/* Second 16 bytes, or the first again for size 16.  */
+	stg	dstin, [dstend, -16]	/* Last 16 bytes; may repeat an earlier store.  */
+L(end):
+	ret
+
+	.p2align 4
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+L(set96):
+	st2g	dstin, [dstin]	/* Bytes 0..31.  */
+	st2g	dstin, [dstin, 32]	/* Bytes 32..63.  */
+	st2g	dstin, [dstend, -32]	/* Final 32 bytes; overlaps the above below 96.  */
+	ret
+
+	.p2align 4
+	/* Size is > 96 bytes.  */
+L(set_long):
+	cmp	count, 160
+	b.lo	L(no_zva)	/* Below 160 bytes use the plain st2g loop.  */
+
+#ifndef SKIP_ZVA_CHECK
+	mrs	zva_val, dczid_el0
+	and	zva_val, zva_val, 31	/* Keep the low 5 bits of DCZID_EL0.  */
+	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
+	b.ne	L(no_zva)	/* Any other value: fall back to st2g.  */
+#endif
+	st2g	dstin, [dstin]	/* Tag the first 64 bytes with two st2g ...  */
+	st2g	dstin, [dstin, 32]	/* ... so the unaligned head is covered.  */
+	bic	dst, dstin, 63	/* Round dst down to a 64-byte boundary.  */
+	sub	count, dstend, dst	/* Count is now 64 too large.  */
+	sub	count, count, 128	/* Adjust count and bias for loop.  */
+
+	.p2align 4
+L(zva_loop):
+	add	dst, dst, 64	/* Step to the next 64-byte aligned block.  */
+	dc	gva, dst	/* Tag the block at dst.  */
+	subs	count, count, 64
+	b.hi	L(zva_loop)	/* Loop while the biased count is above zero.  */
+	st2g	dstin, [dstend, -64]	/* Tag the final 64 bytes ...  */
+	st2g	dstin, [dstend, -32]	/* ... relative to dstend.  */
+	ret
+
+L(no_zva):
+	sub	dst, dstin, 32		/* Dst is biased by -32.  */
+	sub	count, count, 64	/* Adjust count for loop.  */
+L(no_zva_loop):
+	st2g	dstin, [dst, 32]	/* First 32 bytes of this 64-byte step.  */
+	st2g	dstin, [dst, 64]!	/* Pre-index: dst += 64, then store 32 bytes.  */
+	subs	count, count, 64
+	b.hi	L(no_zva_loop)	/* Loop while the biased count is above zero.  */
+	st2g	dstin, [dstend, -64]	/* Tag the final 64 bytes ...  */
+	st2g	dstin, [dstend, -32]	/* ... relative to dstend.  */
+	ret
+
+END (__mtag_tag_region)
+#endif
diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S
new file mode 100644
index 0000000000000000000000000000000000000000..f58364ca6fcb8c11b548b4288efdd21c716d5866
--- /dev/null
+++ b/string/aarch64/__mtag_tag_zero_region.S
@@ -0,0 +1,100 @@
+/*
+ * __mtag_tag_zero_region - tag memory and fill it with zero bytes
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, MTE, LP64 ABI.
+ *
+ * Interface contract:
+ * Address is 16 byte aligned and size is multiple of 16.
+ * Returns the passed pointer.
+ * The memory region may remain untagged if tagging is not enabled.
+ */
+
+#include "../asmdefs.h"
+
+#if __ARM_FEATURE_MEMORY_TAGGING
+
+#define dstin	x0
+#define count	x1
+#define dst	x2
+#define dstend	x3
+#define tmp	x4
+#define zva_val	x4
+
+ENTRY (__mtag_tag_zero_region)	/* x0 = address, x1 = size; x0 is returned unchanged.  */
+	PTR_ARG (0)
+	SIZE_ARG (1)
+
+	add	dstend, dstin, count	/* dstend = one past the end of the region.  */
+
+	cmp	count, 96
+	b.hi	L(set_long)	/* Sizes above 96 take the loop paths.  */
+
+	tbnz	count, 6, L(set96)	/* Bit 6 set with count <= 96: 64..96 bytes.  */
+
+	/* Set 0, 16, 32, or 48 bytes.  */
+	lsr	tmp, count, 5	/* tmp = count / 32, i.e. 0 or 1 for count < 64.  */
+	add	tmp, dstin, tmp, lsl 4	/* tmp = dstin, or dstin + 16 when count >= 32.  */
+	cbz     count, L(end)	/* Size 0: nothing to tag or zero.  */
+	stzg	dstin, [dstin]	/* First 16 bytes.  */
+	stzg	dstin, [tmp]	/* Second 16 bytes, or the first again for size 16.  */
+	stzg	dstin, [dstend, -16]	/* Last 16 bytes; may repeat an earlier store.  */
+L(end):
+	ret
+
+	.p2align 4
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+L(set96):
+	stz2g	dstin, [dstin]	/* Bytes 0..31.  */
+	stz2g	dstin, [dstin, 32]	/* Bytes 32..63.  */
+	stz2g	dstin, [dstend, -32]	/* Final 32 bytes; overlaps the above below 96.  */
+	ret
+
+	.p2align 4
+	/* Size is > 96 bytes.  */
+L(set_long):
+	cmp	count, 160
+	b.lo	L(no_zva)	/* Below 160 bytes use the plain stz2g loop.  */
+
+#ifndef SKIP_ZVA_CHECK
+	mrs	zva_val, dczid_el0
+	and	zva_val, zva_val, 31	/* Keep the low 5 bits of DCZID_EL0.  */
+	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
+	b.ne	L(no_zva)	/* Any other value: fall back to stz2g.  */
+#endif
+	stz2g	dstin, [dstin]	/* Tag and zero the first 64 bytes ...  */
+	stz2g	dstin, [dstin, 32]	/* ... so the unaligned head is covered.  */
+	bic	dst, dstin, 63	/* Round dst down to a 64-byte boundary.  */
+	sub	count, dstend, dst	/* Count is now 64 too large.  */
+	sub	count, count, 128	/* Adjust count and bias for loop.  */
+
+	.p2align 4
+L(zva_loop):
+	add	dst, dst, 64	/* Step to the next 64-byte aligned block.  */
+	dc	gzva, dst	/* Tag and zero the block at dst.  */
+	subs	count, count, 64
+	b.hi	L(zva_loop)	/* Loop while the biased count is above zero.  */
+	stz2g	dstin, [dstend, -64]	/* Tag and zero the final 64 bytes ...  */
+	stz2g	dstin, [dstend, -32]	/* ... relative to dstend.  */
+	ret
+
+L(no_zva):
+	sub	dst, dstin, 32		/* Dst is biased by -32.  */
+	sub	count, count, 64	/* Adjust count for loop.  */
+L(no_zva_loop):
+	stz2g	dstin, [dst, 32]	/* First 32 bytes of this 64-byte step.  */
+	stz2g	dstin, [dst, 64]!	/* Pre-index: dst += 64, then store 32 bytes.  */
+	subs	count, count, 64
+	b.hi	L(no_zva_loop)	/* Loop while the biased count is above zero.  */
+	stz2g	dstin, [dstend, -64]	/* Tag and zero the final 64 bytes ...  */
+	stz2g	dstin, [dstend, -32]	/* ... relative to dstend.  */
+	ret
+
+END (__mtag_tag_zero_region)
+#endif
diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S
index 31ad0507bdbd44f28b90425a54a3696721f1a724..c2e967d1004e06e372725f5cc8ddb95aeb629aa2 100644
--- a/string/aarch64/memchr-mte.S
+++ b/string/aarch64/memchr-mte.S
@@ -44,6 +44,8 @@
    string, counting trailing zeros identifies exactly which byte matched.  */
 
 ENTRY (__memchr_aarch64_mte)
+	PTR_ARG (0)
+	SIZE_ARG (2)
 	bic	src, srcin, 15
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S
index 4a5c72623abfd0ec04021d507503671b28ae545e..c22e6596f19bdde2e6ced26a3ca11e99c0c5b7f5 100644
--- a/string/aarch64/memchr-sve.S
+++ b/string/aarch64/memchr-sve.S
@@ -1,7 +1,7 @@
 /*
  * memchr - find a character in a memory zone
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -14,15 +14,14 @@
  * SVE Available.
  */
 
-	.arch	armv8-a+sve
-	.text
-
-ENTRY_ALIGN(__memchr_aarch64_sve, 4)
+ENTRY (__memchr_aarch64_sve)
+	PTR_ARG (0)
+	SIZE_ARG (2)
 	dup	z1.b, w1			/* duplicate c to a vector */
 	setffr					/* initialize FFR */
 	mov	x3, 0				/* initialize off */
-	nop
 
+	.p2align 4
 0:	whilelo	p1.b, x3, x2			/* make sure off < max */
 	b.none	9f
 
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
index dfba79f70be53f52870ebc9a36045dcf40d0765a..353f0d1eac53098f8b8e921d12af1404ec2cf96c 100644
--- a/string/aarch64/memchr.S
+++ b/string/aarch64/memchr.S
@@ -1,7 +1,7 @@
 /*
  * memchr - find a character in a memory zone
  *
- * Copyright (c) 2014-2019, Arm Limited.
+ * Copyright (c) 2014-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -47,6 +47,8 @@
  */
 
 ENTRY (__memchr_aarch64)
+	PTR_ARG (0)
+	SIZE_ARG (2)
 	/* Do not dereference srcin if no bytes to compare.  */
 	cbz	cntin, L(zero_length)
 	/*
diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S
index 8a0a2ea2d4e0b09f18bb2c2ad715fd0c8310e053..78c5ecaa4cdcba0b826d62369d40f18afa8313d9 100644
--- a/string/aarch64/memcmp-sve.S
+++ b/string/aarch64/memcmp-sve.S
@@ -1,7 +1,7 @@
 /*
  * memcmp - compare memory
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -14,10 +14,10 @@
  * SVE Available.
  */
 
-	.arch	armv8-a+sve
-	.text
-
-ENTRY_ALIGN (__memcmp_aarch64_sve, 4)
+ENTRY (__memcmp_aarch64_sve)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
 	mov	x3, 0			/* initialize off */
 
 0:	whilelo	p0.b, x3, x2		/* while off < max */
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
index dac9147c57a0401611409961d48401aac72860b6..3b1026642eee805ca31d7f88b13eac082ce4b726 100644
--- a/string/aarch64/memcmp.S
+++ b/string/aarch64/memcmp.S
@@ -1,6 +1,6 @@
 /* memcmp - compare memory
  *
- * Copyright (c) 2013, Arm Limited.
+ * Copyright (c) 2013-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -28,6 +28,9 @@
 #define tmp2		x8
 
 ENTRY (__memcmp_aarch64)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
 	subs	limit, limit, 8
 	b.lo	L(less8)
 
diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S
index 3004179bf676bfc80a71f029618f7b8c21f304fa..f97f2c3047b96e489ff97395173f2069469144e0 100644
--- a/string/aarch64/memcpy-advsimd.S
+++ b/string/aarch64/memcpy-advsimd.S
@@ -52,6 +52,9 @@
 
 ENTRY_ALIAS (__memmove_aarch64_simd)
 ENTRY (__memcpy_aarch64_simd)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
 	add	srcend, src, count
 	add	dstend, dstin, count
 	cmp	count, 128
@@ -179,12 +182,13 @@ L(copy_long_backwards):
 	b.ls	L(copy64_from_start)
 
 L(loop64_backwards):
-	stp	A_q, B_q, [dstend, -32]
+	str	B_q, [dstend, -16]
+	str	A_q, [dstend, -32]
 	ldp	A_q, B_q, [srcend, -96]
-	stp	C_q, D_q, [dstend, -64]
+	str	D_q, [dstend, -48]
+	str	C_q, [dstend, -64]!
 	ldp	C_q, D_q, [srcend, -128]
 	sub	srcend, srcend, 64
-	sub	dstend, dstend, 64
 	subs	count, count, 64
 	b.hi	L(loop64_backwards)
 
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
index 157bb0df061679762b37f7addcfb367a52394a57..dd254f6f9929a3986a0c60473df66d573d836092 100644
--- a/string/aarch64/memcpy.S
+++ b/string/aarch64/memcpy.S
@@ -55,6 +55,9 @@
 
 ENTRY_ALIAS (__memmove_aarch64)
 ENTRY (__memcpy_aarch64)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
 	add	srcend, src, count
 	add	dstend, dstin, count
 	cmp	count, 128
diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S
index ad42b49fcd8e72ad7787eb7f08aa2e1d4ad884cf..7b4be847cecbf93820be6ca931cf6b4569bf382f 100644
--- a/string/aarch64/memrchr.S
+++ b/string/aarch64/memrchr.S
@@ -46,6 +46,7 @@
    string, counting trailing zeros identifies exactly which byte matched.  */
 
 ENTRY (__memrchr_aarch64)
+	PTR_ARG (0)
 	add	end, srcin, cntin
 	sub	endm1, end, 1
 	bic	src, endm1, 15
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
index 27743f1da2c902f39d94f445876f2ed2fa8e628a..9fcd97579913b025028f6728098ebd570992cb7d 100644
--- a/string/aarch64/memset.S
+++ b/string/aarch64/memset.S
@@ -1,7 +1,7 @@
 /*
  * memset - fill memory with a constant byte
  *
- * Copyright (c) 2012-2020, Arm Limited.
+ * Copyright (c) 2012-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -22,6 +22,8 @@
 #define zva_val	x5
 
 ENTRY (__memset_aarch64)
+	PTR_ARG (0)
+	SIZE_ARG (2)
 
 	dup	v0.16B, valw
 	add	dstend, dstin, count
@@ -37,7 +39,7 @@ ENTRY (__memset_aarch64)
 	str	val, [dstin]
 	str	val, [dstend, -8]
 	ret
-	nop
+	.p2align 4
 1:	tbz	count, 2, 2f
 	str	valw, [dstin]
 	str	valw, [dstend, -4]
diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S
index 577752e0733caa3a9897bf548ce18b1534a2156c..dcb0e46258709760e7ef1c7d81e47a86457a2846 100644
--- a/string/aarch64/strchr-mte.S
+++ b/string/aarch64/strchr-mte.S
@@ -43,6 +43,7 @@
    string, counting trailing zeros identifies exactly which byte matched.  */
 
 ENTRY (__strchr_aarch64_mte)
+	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S
index 495beda347b32d66a3a4b642496f0bd80a3d3f2e..13ba9f44f9c5a3dd716252b0459955cfe12c3b18 100644
--- a/string/aarch64/strchr-sve.S
+++ b/string/aarch64/strchr-sve.S
@@ -1,7 +1,7 @@
 /*
  * strchr/strchrnul - find a character in a string
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -14,9 +14,6 @@
  * SVE Available.
  */
 
-	.arch	armv8-a+sve
-	.text
-
 /* To build as strchrnul, define BUILD_STRCHRNUL before compiling this file.  */
 #ifdef BUILD_STRCHRNUL
 #define FUNC  __strchrnul_aarch64_sve
@@ -24,7 +21,8 @@
 #define FUNC  __strchr_aarch64_sve
 #endif
 
-ENTRY_ALIGN (FUNC, 4)
+ENTRY (FUNC)
+	PTR_ARG (0)
 	dup	z1.b, w1		/* replicate byte across vector */
 	setffr				/* initialize FFR */
 	ptrue	p1.b			/* all ones; loop invariant */
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
index 8d8e3fc6811c37adab3a6d09b15ccd36e0a332ee..1063cbfd77aa817ed1502e0b2c39643fb102c16b 100644
--- a/string/aarch64/strchr.S
+++ b/string/aarch64/strchr.S
@@ -1,7 +1,7 @@
 /*
  * strchr - find a character in a string
  *
- * Copyright (c) 2014-2019, Arm Limited.
+ * Copyright (c) 2014-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -51,6 +51,7 @@
 /* Locals and temporaries.  */
 
 ENTRY (__strchr_aarch64)
+	PTR_ARG (0)
 	/* Magic constant 0xc0300c03 to allow us to identify which lane
 	   matches the requested byte.  Even bits are set if the character
 	   matches, odd bits if either the char is NUL or matches.  */
diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S
index 0dbf0dccd733c4c7a4754b0bb6ba1397888add21..1b0d0a63094c6567c3ee3654b416635f28a8acfd 100644
--- a/string/aarch64/strchrnul-mte.S
+++ b/string/aarch64/strchrnul-mte.S
@@ -41,6 +41,7 @@
    string, counting trailing zeros identifies exactly which byte matched.  */
 
 ENTRY (__strchrnul_aarch64_mte)
+	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S
index 5140e59ce9c84e278ae0c571957468e22b08b754..428ff1a3d008325778eccc4e9fe1ec99bfc70bb5 100644
--- a/string/aarch64/strchrnul-sve.S
+++ b/string/aarch64/strchrnul-sve.S
@@ -1,7 +1,7 @@
 /*
  * strchrnul - find a character or nul in a string
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2019, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
index 45be15ceab0c75ad8d3fabf149ddaf1f8dbbb4df..a4230d919b478d3001d412a7b3574f7ec94d2fb1 100644
--- a/string/aarch64/strchrnul.S
+++ b/string/aarch64/strchrnul.S
@@ -1,7 +1,7 @@
 /*
  * strchrnul - find a character or nul in a string
  *
- * Copyright (c) 2014-2019, Arm Limited.
+ * Copyright (c) 2014-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -47,6 +47,7 @@
 /* Locals and temporaries.  */
 
 ENTRY (__strchrnul_aarch64)
+	PTR_ARG (0)
 	/* Magic constant 0x40100401 to allow us to identify which lane
 	   matches the termination condition.  */
 	mov	wtmp2, #0x0401
diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S
index 8f2abc40c939b2cdc2f6b251ece3b1859653cf12..12d1a6b51dd3442ca89ba7994569ce9e54b0e351 100644
--- a/string/aarch64/strcmp-mte.S
+++ b/string/aarch64/strcmp-mte.S
@@ -51,6 +51,8 @@
 
 
 ENTRY (__strcmp_aarch64_mte)
+	PTR_ARG (0)
+	PTR_ARG (1)
 	sub	off2, src2, src1
 	mov	zeroones, REP8_01
 	and	tmp, src1, 7
@@ -99,6 +101,8 @@ L(end):
 	sub	result, data1, data2, lsr 56
 	ret
 
+	.p2align 4
+
 L(mutual_align):
 	/* Sources are mutually aligned, but are not currently at an
 	   alignment boundary.  Round down the addresses and then mask off
@@ -127,17 +131,18 @@ L(do_misaligned):
 	b.ne	L(do_misaligned)
 
 L(src1_aligned):
-	lsl	shift, src2, 3
+	neg	shift, src2, lsl 3
 	bic	src2, src2, 7
 	ldr	data3, [src2], 8
 #ifdef __AARCH64EB__
 	rev	data3, data3
 #endif
+	lsr	tmp, zeroones, shift
+	orr	data3, data3, tmp
 	sub	has_nul, data3, zeroones
 	orr	tmp, data3, REP8_7f
-	bic	has_nul, has_nul, tmp
-	lsr	tmp, has_nul, shift
-	cbnz	tmp, L(tail)
+	bics	has_nul, has_nul, tmp
+	b.ne	L(tail)
 
 	sub	off1, src2, src1
 
@@ -156,8 +161,7 @@ L(loop_unaligned):
 	ccmp	data1, data2, 0, eq
 	b.eq	L(loop_unaligned)
 
-	neg	tmp, shift
-	lsl	tmp, has_nul, tmp
+	lsl	tmp, has_nul, shift
 #ifdef __AARCH64EB__
 	rev	tmp, tmp
 #endif
@@ -166,6 +170,7 @@ L(loop_unaligned):
 	cbnz	syndrome, L(end)
 L(tail):
 	ldr	data1, [src1]
+	neg	shift, shift
 	lsr	data2, data3, shift
 	lsr	has_nul, has_nul, shift
 #ifdef __AARCH64EB__
@@ -180,6 +185,5 @@ L(done):
 	sub	result, data1, data2
 	ret
 
-
 END (__strcmp_aarch64_mte)
 
diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S
index dc5b76923f4be7d95f04d4e683ab2b1373e88350..e6d2da5411cac58a14b62d4767022a0c22b87ecc 100644
--- a/string/aarch64/strcmp-sve.S
+++ b/string/aarch64/strcmp-sve.S
@@ -1,7 +1,7 @@
 /*
  * __strcmp_aarch64_sve - compare two strings
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -14,16 +14,15 @@
  * SVE Available.
  */
 
-	.arch	armv8-a+sve
-	.text
-
-ENTRY_ALIGN (__strcmp_aarch64_sve, 4)
+ENTRY (__strcmp_aarch64_sve)
+	PTR_ARG (0)
+	PTR_ARG (1)
 	setffr				/* initialize FFR */
 	ptrue	p1.b, all		/* all ones; loop invariant */
 	mov	x2, 0			/* initialize offset */
-	nop
 
 	/* Read a vector's worth of bytes, stopping on first fault.  */
+	.p2align 4
 0:	ldff1b	z0.b, p1/z, [x0, x2]
 	ldff1b	z1.b, p1/z, [x1, x2]
 	rdffrs	p0.b, p1/z
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
index ee95958d97a396d2e198ee6d5a7e7a5de28035ff..7714ebf5577d84a279f911914f5f7f28d41f3e8c 100644
--- a/string/aarch64/strcmp.S
+++ b/string/aarch64/strcmp.S
@@ -37,6 +37,8 @@
 
 	/* Start of performance-critical section  -- one 64B cache line.  */
 ENTRY (__strcmp_aarch64)
+	PTR_ARG (0)
+	PTR_ARG (1)
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
 	tst	tmp1, #7
diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S
index 7c8629e07b93725e24c6669e992470beb8486700..88c222d61e53ad6841b10ef2b874852df203d800 100644
--- a/string/aarch64/strcpy-mte.S
+++ b/string/aarch64/strcpy-mte.S
@@ -55,6 +55,8 @@
    string, counting trailing zeros identifies exactly which byte matched.  */
 
 ENTRY (STRCPY)
+	PTR_ARG (0)
+	PTR_ARG (1)
 	bic	src, srcin, 15
 	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S
index a785d45409a09d8301421360e50dce1633212fea..f515462e09ae768dbc921ba2928150dd5a98c6e7 100644
--- a/string/aarch64/strcpy-sve.S
+++ b/string/aarch64/strcpy-sve.S
@@ -1,7 +1,7 @@
 /*
  * strcpy/stpcpy - copy a string returning pointer to start/end.
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -14,9 +14,6 @@
  * SVE Available.
  */
 
-	.arch	armv8-a+sve
-	.text
-
 /* To build as stpcpy, define BUILD_STPCPY before compiling this file.  */
 #ifdef BUILD_STPCPY
 #define FUNC  __stpcpy_aarch64_sve
@@ -24,7 +21,9 @@
 #define FUNC  __strcpy_aarch64_sve
 #endif
 
-ENTRY_ALIGN (FUNC, 4)
+ENTRY (FUNC)
+	PTR_ARG (0)
+	PTR_ARG (1)
 	setffr				/* initialize FFR */
 	ptrue	p2.b, all		/* all ones; loop invariant */
 	mov	x2, 0			/* initialize offset */
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index a6090c8a8d391d97adef41bc031392c4a32299be..6e9ed424b693919e95f7fbe8569fc9024633715a 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -1,7 +1,7 @@
 /*
  * strcpy/stpcpy - copy a string returning pointer to start/end.
  *
- * Copyright (c) 2013-2019, Arm Limited.
+ * Copyright (c) 2013-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -80,6 +80,8 @@
 #define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
 
 ENTRY (STRCPY)
+	PTR_ARG (0)
+	PTR_ARG (1)
 	/* For moderately short strings, the fastest way to do the copy is to
 	   calculate the length of the string in the same way as strlen, then
 	   essentially do a memcpy of the result.  This avoids the need for
diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S
index 6a993401cee1422a7de205adbb380cb3aea30d9e..7cf41d5c1eac995332ae42bbaf962116eb32457d 100644
--- a/string/aarch64/strlen-mte.S
+++ b/string/aarch64/strlen-mte.S
@@ -39,6 +39,7 @@
    string, counting trailing zeros identifies exactly which byte matched.  */
 
 ENTRY (__strlen_aarch64_mte)
+	PTR_ARG (0)
 	bic	src, srcin, 15
 	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S
index 9a9a3591c01b163b8184b5064bca59b2689f6fa5..2392493f1a3c4c79b67f790bfa064766253e55e7 100644
--- a/string/aarch64/strlen-sve.S
+++ b/string/aarch64/strlen-sve.S
@@ -1,7 +1,7 @@
 /*
  * __strlen_aarch64_sve - compute the length of a string
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -14,18 +14,15 @@
  * SVE Available.
  */
 
-	.arch	armv8-a+sve
-	.text
-
-ENTRY_ALIGN (__strlen_aarch64_sve, 4)
+ENTRY (__strlen_aarch64_sve)
+	PTR_ARG (0)
 	setffr			/* initialize FFR */
 	ptrue	p2.b		/* all ones; loop invariant */
 	mov	x1, 0		/* initialize length */
-	nop
 
 	/* Read a vector's worth of bytes, stopping on first fault.  */
+	.p2align 4
 0:	ldff1b	z0.b, p2/z, [x0, x1]
-	nop
 	rdffrs	p0.b, p2/z
 	b.nlast	2f
 
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
index 3aa444bc32fad1977da26c8aae4b312b18de112b..a1b164a49238243419c89a365dd6757f9e9be7cd 100644
--- a/string/aarch64/strlen.S
+++ b/string/aarch64/strlen.S
@@ -1,84 +1,88 @@
 /*
- * strlen - calculate the length of a string
+ * strlen - calculate the length of a string.
  *
- * Copyright (c) 2013, Arm Limited.
+ * Copyright (c) 2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * Not MTE compatible.
  */
 
 #include "../asmdefs.h"
 
-/* To test the page crossing code path more thoroughly, compile with
-   -DTEST_PAGE_CROSS - this will force all calls through the slower
-   entry path.  This option is not intended for production use.	 */
-
-/* Arguments and results.  */
-#define srcin		x0
-#define len		x0
-
-/* Locals and temporaries.  */
-#define src		x1
-#define data1		x2
-#define data2		x3
-#define has_nul1	x4
-#define has_nul2	x5
-#define tmp1		x4
-#define tmp2		x5
-#define tmp3		x6
-#define tmp4		x7
-#define zeroones	x8
-
-	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	   can be done in parallel across the entire word. A faster check
-	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
-	   false hits for characters 129..255.	*/
+#define srcin	x0
+#define len	x0
+
+#define src	x1
+#define data1	x2
+#define data2	x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1	x4
+#define tmp2	x5
+#define tmp3	x6
+#define tmp4	x7
+#define zeroones x8
+
+#define maskv	v0
+#define maskd	d0
+#define dataq1	q1
+#define dataq2	q2
+#define datav1	v1
+#define datav2	v2
+#define tmp	x2
+#define tmpw	w2
+#define synd	x3
+#define shift	x4
+
+/* For the first 32 bytes, NUL detection works on the principle that
+   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a
+   byte is zero, and can be done in parallel across the entire word.  */
 
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
+
+/* To test the page crossing code path more thoroughly, compile with
+   -DTEST_PAGE_CROSS - this will force all calls through the slower
+   entry path.  This option is not intended for production use.  */
 
 #ifdef TEST_PAGE_CROSS
-# define MIN_PAGE_SIZE 15
+# define MIN_PAGE_SIZE 32
 #else
 # define MIN_PAGE_SIZE 4096
 #endif
 
-	/* Since strings are short on average, we check the first 16 bytes
-	   of the string for a NUL character.  In order to do an unaligned ldp
-	   safely we have to do a page cross check first.  If there is a NUL
-	   byte we calculate the length from the 2 8-byte words using
-	   conditional select to reduce branch mispredictions (it is unlikely
-	   __strlen_aarch64 will be repeatedly called on strings with the same length).
-
-	   If the string is longer than 16 bytes, we align src so don't need
-	   further page cross checks, and process 32 bytes per iteration
-	   using the fast NUL check.  If we encounter non-ASCII characters,
-	   fallback to a second loop using the full NUL check.
-
-	   If the page cross check fails, we read 16 bytes from an aligned
-	   address, remove any characters before the string, and continue
-	   in the main loop using aligned loads.  Since strings crossing a
-	   page in the first 16 bytes are rare (probability of
-	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
-
-	   AArch64 systems have a minimum page size of 4k.  We don't bother
-	   checking for larger page sizes - the cost of setting up the correct
-	   page size is just not worth the extra gain from a small reduction in
-	   the cases taking the slow path.  Note that we only care about
-	   whether the first fetch, which may be misaligned, crosses a page
-	   boundary.  */
+/* Core algorithm:
+
+   Since strings are short on average, we check the first 32 bytes of the
+   string for a NUL character without aligning the string.  In order to use
+   unaligned loads safely we must do a page cross check first.
+
+   If there is a NUL byte we calculate the length from the 2 8-byte words
+   using conditional select to reduce branch mispredictions (it is unlikely
+   strlen will be repeatedly called on strings with the same length).
+
+   If the string is longer than 32 bytes, align src so we don't need further
+   page cross checks, and process 32 bytes per iteration using a fast SIMD
+   loop.
+
+   If the page cross check fails, we read 32 bytes from an aligned address,
+   and ignore any characters before the string.  If the remaining bytes
+   contain a NUL, return the length; otherwise continue in the main loop.  */
 
 ENTRY (__strlen_aarch64)
+	PTR_ARG (0)
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
-	mov	zeroones, REP8_01
-	cmp	tmp1, MIN_PAGE_SIZE - 16
-	b.gt	L(page_cross)
+	cmp	tmp1, MIN_PAGE_SIZE - 32
+	b.hi	L(page_cross)
+
+	/* Look for a NUL byte in the first 16 bytes.  */
 	ldp	data1, data2, [srcin]
+	mov	zeroones, REP8_01
+
 #ifdef __AARCH64EB__
 	/* For big-endian, carry propagation (if the final byte in the
 	   string is 0x01) means we cannot use has_nul1/2 directly.
@@ -94,114 +98,103 @@ ENTRY (__strlen_aarch64)
 	bics	has_nul1, tmp1, tmp2
 	bic	has_nul2, tmp3, tmp4
 	ccmp	has_nul2, 0, 0, eq
-	beq	L(main_loop_entry)
+	b.eq	L(bytes16_31)
 
-	/* Enter with C = has_nul1 == 0.  */
+	/* Find the exact offset of the first NUL byte in the first 16 bytes
+	   from the string start.  Enter with C = has_nul1 == 0.  */
 	csel	has_nul1, has_nul1, has_nul2, cc
 	mov	len, 8
 	rev	has_nul1, has_nul1
-	clz	tmp1, has_nul1
 	csel	len, xzr, len, cc
+	clz	tmp1, has_nul1
 	add	len, len, tmp1, lsr 3
 	ret
 
-	/* The inner loop processes 32 bytes per iteration and uses the fast
-	   NUL check.  If we encounter non-ASCII characters, use a second
-	   loop with the accurate NUL check.  */
-	.p2align 4
-L(main_loop_entry):
-	bic	src, srcin, 15
-	sub	src, src, 16
-L(main_loop):
-	ldp	data1, data2, [src, 32]!
-L(page_cross_entry):
-	sub	tmp1, data1, zeroones
-	sub	tmp3, data2, zeroones
-	orr	tmp2, tmp1, tmp3
-	tst	tmp2, zeroones, lsl 7
-	bne	1f
-	ldp	data1, data2, [src, 16]
+	.p2align 3
+	/* Look for a NUL byte at offset 16..31 in the string.  */
+L(bytes16_31):
+	ldp	data1, data2, [srcin, 16]
+#ifdef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
 	sub	tmp1, data1, zeroones
-	sub	tmp3, data2, zeroones
-	orr	tmp2, tmp1, tmp3
-	tst	tmp2, zeroones, lsl 7
-	beq	L(main_loop)
-	add	src, src, 16
-1:
-	/* The fast check failed, so do the slower, accurate NUL check.	 */
 	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
 	orr	tmp4, data2, REP8_7f
 	bics	has_nul1, tmp1, tmp2
 	bic	has_nul2, tmp3, tmp4
 	ccmp	has_nul2, 0, 0, eq
-	beq	L(nonascii_loop)
+	b.eq	L(loop_entry)
 
-	/* Enter with C = has_nul1 == 0.  */
-L(tail):
-#ifdef __AARCH64EB__
-	/* For big-endian, carry propagation (if the final byte in the
-	   string is 0x01) means we cannot use has_nul1/2 directly.  The
-	   easiest way to get the correct byte is to byte-swap the data
-	   and calculate the syndrome a second time.  */
-	csel	data1, data1, data2, cc
-	rev	data1, data1
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, REP8_7f
-	bic	has_nul1, tmp1, tmp2
-#else
+	/* Find the exact offset of the first NUL byte at offset 16..31 from
+	   the string start.  Enter with C = has_nul1 == 0.  */
 	csel	has_nul1, has_nul1, has_nul2, cc
-#endif
-	sub	len, src, srcin
+	mov	len, 24
 	rev	has_nul1, has_nul1
-	add	tmp2, len, 8
+	mov	tmp3, 16
 	clz	tmp1, has_nul1
-	csel	len, len, tmp2, cc
+	csel	len, tmp3, len, cc
 	add	len, len, tmp1, lsr 3
 	ret
 
-L(nonascii_loop):
-	ldp	data1, data2, [src, 16]!
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, REP8_7f
-	bics	has_nul1, tmp1, tmp2
-	bic	has_nul2, tmp3, tmp4
-	ccmp	has_nul2, 0, 0, eq
-	bne	L(tail)
-	ldp	data1, data2, [src, 16]!
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, REP8_7f
-	bics	has_nul1, tmp1, tmp2
-	bic	has_nul2, tmp3, tmp4
-	ccmp	has_nul2, 0, 0, eq
-	beq	L(nonascii_loop)
-	b	L(tail)
+L(loop_entry):
+	bic	src, srcin, 31
 
-	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
-	   srcin to 0x7f, so we ignore any NUL bytes before the string.
-	   Then continue in the aligned loop.  */
-L(page_cross):
-	bic	src, srcin, 15
-	ldp	data1, data2, [src]
-	lsl	tmp1, srcin, 3
-	mov	tmp4, -1
+	.p2align 5
+L(loop):
+	ldp	dataq1, dataq2, [src, 32]!
+	uminp	maskv.16b, datav1.16b, datav2.16b
+	uminp	maskv.16b, maskv.16b, maskv.16b
+	cmeq	maskv.8b, maskv.8b, 0
+	fmov	synd, maskd
+	cbz	synd, L(loop)
+
+	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
+	cmeq	maskv.16b, datav1.16b, 0
+	sub	len, src, srcin
+	tst	synd, 0xffffffff
+	b.ne	1f
+	cmeq	maskv.16b, datav2.16b, 0
+	add	len, len, 16
+1:
+	/* Generate a bitmask and compute correct byte offset.  */
 #ifdef __AARCH64EB__
-	/* Big-endian.	Early bytes are at MSB.	 */
-	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
+	bic	maskv.8h, 0xf0
 #else
-	/* Little-endian.  Early bytes are at LSB.  */
-	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
+	bic	maskv.8h, 0x0f, lsl 8
 #endif
-	orr	tmp1, tmp1, REP8_80
-	orn	data1, data1, tmp1
-	orn	tmp2, data2, tmp1
-	tst	srcin, 8
-	csel	data1, data1, tmp4, eq
-	csel	data2, data2, tmp2, eq
-	b	L(page_cross_entry)
+	umaxp	maskv.16b, maskv.16b, maskv.16b
+	fmov	synd, maskd
+#ifndef __AARCH64EB__
+	rbit	synd, synd
+#endif
+	clz	tmp, synd
+	add	len, len, tmp, lsr 2
+	ret
 
-END (__strlen_aarch64)
+        .p2align 4
 
+L(page_cross):
+	bic	src, srcin, 31
+	mov	tmpw, 0x0c03
+	movk	tmpw, 0xc030, lsl 16
+	ld1	{datav1.16b, datav2.16b}, [src]
+	dup	maskv.4s, tmpw
+	cmeq	datav1.16b, datav1.16b, 0
+	cmeq	datav2.16b, datav2.16b, 0
+	and	datav1.16b, datav1.16b, maskv.16b
+	and	datav2.16b, datav2.16b, maskv.16b
+	addp	maskv.16b, datav1.16b, datav2.16b
+	addp	maskv.16b, maskv.16b, maskv.16b
+	fmov	synd, maskd
+	lsl	shift, srcin, 1
+	lsr	synd, synd, shift
+	cbz	synd, L(loop)
+
+	rbit	synd, synd
+	clz	len, synd
+	lsr	len, len, 1
+	ret
+
+END (__strlen_aarch64)
diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S
index b7e3914e77dfc008b747915ae182b2bc60ae3554..c9d6fc8a158beca38419a6ccf82cd8573394f7b6 100644
--- a/string/aarch64/strncmp-mte.S
+++ b/string/aarch64/strncmp-mte.S
@@ -1,7 +1,7 @@
 /*
  * strncmp - compare two strings
  *
- * Copyright (c) 2013-2020, Arm Limited.
+ * Copyright (c) 2013-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -53,12 +53,10 @@
 #define LS_BK lsl
 #endif
 
-	.text
-	.p2align 6
-	.rep 9
-	nop	/* Pad so that the loop below fits a cache line.  */
-	.endr
-ENTRY_ALIGN (__strncmp_aarch64_mte, 0)
+ENTRY (__strncmp_aarch64_mte)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
 	cbz	limit, L(ret0)
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
@@ -70,7 +68,7 @@ ENTRY_ALIGN (__strncmp_aarch64_mte, 0)
 	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
 	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
 	   can be done in parallel across the entire word.  */
-	/* Start of performance-critical section  -- one 64B cache line.  */
+	.p2align 4
 L(loop_aligned):
 	ldr	data1, [src1], #8
 	ldr	data2, [src2], #8
@@ -83,7 +81,7 @@ L(start_realigned):
 	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
 	ccmp	endloop, #0, #0, eq
 	b.eq	L(loop_aligned)
-	/* End of performance-critical section  -- one 64B cache line.  */
+	/* End of main loop */
 
 L(full_check):
 #ifndef __AARCH64EB__
@@ -167,15 +165,15 @@ L(mutual_align):
 	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
 	ldr	data2, [src2], #8
 	mov	tmp2, #~0
-	and	count, count, #0x3f
 	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
-	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
-	add	limit, limit, count
+	/* Adjust the limit and ensure it doesn't overflow.  */
+	adds	limit, limit, count
+	csinv	limit, limit, xzr, lo
 	orr	data1, data1, tmp2
 	orr	data2, data2, tmp2
 	b	L(start_realigned)
 
-	.p2align 6
+	.p2align 4
 	/* Don't bother with dwords for up to 16 bytes.  */
 L(misaligned8):
 	cmp	limit, #16
diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S
index fdbe7aea78f910b94429af01bbfe5d0c688c6a2d..234190e245b0ba30f6257fad70b9fcbc4ce767cd 100644
--- a/string/aarch64/strncmp-sve.S
+++ b/string/aarch64/strncmp-sve.S
@@ -1,7 +1,7 @@
 /*
  * strncmp - compare two strings with limit
  *
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -14,10 +14,10 @@
  * SVE Available.
  */
 
-	.arch	armv8-a+sve
-	.text
-
-ENTRY_ALIGN (__strncmp_aarch64_sve, 4)
+ENTRY (__strncmp_aarch64_sve)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
 	setffr				/* initialize FFR */
 	mov	x3, 0			/* initialize off */
 
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
index 584c54af8063253ee75618051126ac802037bdc3..738b6539cab647129d801a21bb7b88876b37c070 100644
--- a/string/aarch64/strncmp.S
+++ b/string/aarch64/strncmp.S
@@ -1,7 +1,7 @@
 /*
  * strncmp - compare two strings
  *
- * Copyright (c) 2013, Arm Limited.
+ * Copyright (c) 2013-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -40,12 +40,10 @@
 #define endloop		x15
 #define count		mask
 
-	.text
-	.p2align 6
-	.rep 6
-	nop	/* Pad so that the loop below fits a cache line.  */
-	.endr
-ENTRY_ALIGN (__strncmp_aarch64, 0)
+ENTRY (__strncmp_aarch64)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
 	cbz	limit, L(ret0)
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
@@ -60,7 +58,7 @@ ENTRY_ALIGN (__strncmp_aarch64, 0)
 	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
 	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
 	   can be done in parallel across the entire word.  */
-	/* Start of performance-critical section  -- one 64B cache line.  */
+	.p2align 4
 L(loop_aligned):
 	ldr	data1, [src1], #8
 	ldr	data2, [src2], #8
@@ -73,7 +71,7 @@ L(start_realigned):
 	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
 	ccmp	endloop, #0, #0, eq
 	b.eq	L(loop_aligned)
-	/* End of performance-critical section  -- one 64B cache line.  */
+	/* End of main loop */
 
 	/* Not reached the limit, must have found the end or a diff.  */
 	tbz	limit_wd, #63, L(not_limit)
@@ -178,7 +176,7 @@ L(mutual_align):
 	add	limit_wd, limit_wd, tmp3, lsr #3
 	b	L(start_realigned)
 
-	.p2align 6
+	.p2align 4
 	/* Don't bother with dwords for up to 16 bytes.  */
 L(misaligned8):
 	cmp	limit, #16
diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S
index 5ad40d3333aa65284a0cd35a911332669f3bbef4..5b9ebf7763bc2491011641702eac4dbc32f45482 100644
--- a/string/aarch64/strnlen-sve.S
+++ b/string/aarch64/strnlen-sve.S
@@ -1,7 +1,7 @@
 /*
  * strnlen - calculate the length of a string with limit.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -14,10 +14,9 @@
  * SVE Available.
  */
 
-	.arch	armv8-a+sve
-	.text
-
-ENTRY_ALIGN (__strnlen_aarch64_sve, 4)
+ENTRY (__strnlen_aarch64_sve)
+	PTR_ARG (0)
+	SIZE_ARG (1)
 	setffr				/* initialize FFR */
 	mov	x2, 0			/* initialize len */
 	b	1f
@@ -66,7 +65,7 @@ ENTRY_ALIGN (__strnlen_aarch64_sve, 4)
 	b	1b
 
 	/* End of count.  Return max.  */
-9:	mov	x0, x2
+9:	mov	x0, x1
 	ret
 
 END (__strnlen_aarch64_sve)
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
index 4852edca38b2c07ed2f2b4a467931460c57cddef..48d2495d2082be8318c88148eb21d00ee6f0b421 100644
--- a/string/aarch64/strnlen.S
+++ b/string/aarch64/strnlen.S
@@ -42,6 +42,8 @@
    string, counting trailing zeros identifies exactly which byte matched.  */
 
 ENTRY (__strnlen_aarch64)
+	PTR_ARG (0)
+	SIZE_ARG (1)
 	bic	src, srcin, 15
 	mov	wtmp, 0xf00f
 	cbz	cntin, L(nomatch)
diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S
index 5a409b950b4b508f41f12b982f5b6d0dcf9deed6..1e4fb1a68f7e8bc21a65f5925194f5d188d01e7c 100644
--- a/string/aarch64/strrchr-mte.S
+++ b/string/aarch64/strrchr-mte.S
@@ -44,6 +44,7 @@
    if the relevant byte matched the NUL end of string.  */
 
 ENTRY (__strrchr_aarch64_mte)
+	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	mov	wtmp, 0x3003
diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S
index dbb9bfde98489b407610b674c600ac3571e0a668..d36d69af37fd71a23f656ae0c5bc87f719bd3073 100644
--- a/string/aarch64/strrchr-sve.S
+++ b/string/aarch64/strrchr-sve.S
@@ -1,7 +1,7 @@
 /*
  * strrchr - find the last of a character in a string
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -14,10 +14,8 @@
  * SVE Available.
  */
 
-	.arch	armv8-a+sve
-	.text
-
-ENTRY_ALIGN (__strrchr_aarch64_sve, 4)
+ENTRY (__strrchr_aarch64_sve)
+	PTR_ARG (0)
 	dup	z1.b, w1		/* replicate byte across vector */
 	setffr				/* initialize FFR */
 	ptrue	p1.b			/* all ones; loop invariant */
diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S
index f3d22d4b4b945d398a653fb4c690657d0e43faee..56185ff534e3915d3ada2c025b2943489b9b2d7b 100644
--- a/string/aarch64/strrchr.S
+++ b/string/aarch64/strrchr.S
@@ -55,6 +55,7 @@
    identify exactly which byte is causing the termination, and why.  */
 
 ENTRY (__strrchr_aarch64)
+	PTR_ARG (0)
 	/* Magic constant 0x40100401 to allow us to identify which lane
 	   matches the requested byte.  Magic constant 0x80200802 used
 	   similarly for NUL termination.  */
diff --git a/string/arm/memchr.S b/string/arm/memchr.S
index 565708cd48f98bab42c5abd75a3f028bd8d775e3..3f1ac4df136fcb556bb5785e493e6d1fe7582d9b 100644
--- a/string/arm/memchr.S
+++ b/string/arm/memchr.S
@@ -1,7 +1,7 @@
 /*
  * memchr - scan memory for a character
  *
- * Copyright (c) 2010, Arm Limited.
+ * Copyright (c) 2010-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -26,13 +26,11 @@
 	.arch armv7-a
 
 @ this lets us check a flag in a 00/ff byte easily in either endianness
-#define __memchr_arm memchr
 #ifdef __ARMEB__
 #define CHARTSTMASK(c) 1<<(31-(c*8))
 #else
 #define CHARTSTMASK(c) 1<<(c*8)
 #endif
-	.text
 	.thumb
 
 @ ---------------------------------------------------------------------------
diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S
index 46492b539003e7388f79ef23423570a51a8fa9bb..86e64938edb1f9a73a8577411efd516e6ff8ee82 100644
--- a/string/arm/memcpy.S
+++ b/string/arm/memcpy.S
@@ -1,7 +1,7 @@
 /*
  * memcpy - copy memory area
  *
- * Copyright (c) 2013, Arm Limited.
+ * Copyright (c) 2013-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -16,8 +16,8 @@
     Unaligned accesses
 
  */
+
 #include "../asmdefs.h"
-#define __memcpy_arm memcpy
 
 	.syntax unified
 	/* This implementation requires ARM state.  */
@@ -124,7 +124,7 @@ ENTRY (__memcpy_arm)
 
 	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
 	cmp	count, #64
-	bge	L(cpy_not_short)
+	bhs	L(cpy_not_short)
 	/* Deal with small copies quickly by dropping straight into the
 	   exit block.  */
 
@@ -239,10 +239,10 @@ L(cpy_not_short):
 
 1:
 	subs	tmp2, count, #64	/* Use tmp2 for count.  */
-	blt	L(tail63aligned)
+	blo	L(tail63aligned)
 
 	cmp	tmp2, #512
-	bge	L(cpy_body_long)
+	bhs	L(cpy_body_long)
 
 L(cpy_body_medium):			/* Count in tmp2.  */
 #ifdef USE_VFP
@@ -266,7 +266,7 @@ L(cpy_body_medium):			/* Count in tmp2.  */
 	add	src, src, #64
 	vstr	d1, [dst, #56]
 	add	dst, dst, #64
-	bge	1b
+	bhs	1b
 	tst	tmp2, #0x3f
 	beq	L(done)
 
@@ -312,7 +312,7 @@ L(tail63aligned):			/* Count in tmp2.  */
 	ldrd	A_l, A_h, [src, #64]!
 	strd	A_l, A_h, [dst, #64]!
 	subs	tmp2, tmp2, #64
-	bge	1b
+	bhs	1b
 	tst	tmp2, #0x3f
 	bne	1f
 	ldr	tmp2,[sp], #FRAME_SIZE
@@ -383,7 +383,7 @@ L(cpy_body_long):			/* Count in tmp2.  */
 	add	src, src, #32
 
 	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
-	blt	2f
+	blo	2f
 1:
 	cpy_line_vfp	d3, 0
 	cpy_line_vfp	d4, 64
@@ -395,7 +395,7 @@ L(cpy_body_long):			/* Count in tmp2.  */
 	add	dst, dst, #2 * 64
 	add	src, src, #2 * 64
 	subs	tmp2, tmp2, #prefetch_lines * 64
-	bge	1b
+	bhs	1b
 
 2:
 	cpy_tail_vfp	d3, 0
@@ -499,15 +499,15 @@ L(cpy_notaligned):
 1:
 	pld	[src, #(3 * 64)]
 	subs	count, count, #64
-	ldrmi	tmp2, [sp], #FRAME_SIZE
-	bmi	L(tail63unaligned)
+	ldrlo	tmp2, [sp], #FRAME_SIZE
+	blo	L(tail63unaligned)
 	pld	[src, #(4 * 64)]
 
 #ifdef USE_NEON
 	vld1.8	{d0-d3}, [src]!
 	vld1.8	{d4-d7}, [src]!
 	subs	count, count, #64
-	bmi	2f
+	blo	2f
 1:
 	pld	[src, #(4 * 64)]
 	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
@@ -515,7 +515,7 @@ L(cpy_notaligned):
 	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
 	vld1.8	{d4-d7}, [src]!
 	subs	count, count, #64
-	bpl	1b
+	bhs	1b
 2:
 	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
 	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
diff --git a/string/arm/memset.S b/string/arm/memset.S
index 3ee523814e9559ff7d442affa6b2637d0524cc95..11e927368fd196ed1825549e3757e6a3e58538dd 100644
--- a/string/arm/memset.S
+++ b/string/arm/memset.S
@@ -1,7 +1,7 @@
 /*
  * memset - fill memory with a constant
  *
- * Copyright (c) 2010, Arm Limited.
+ * Copyright (c) 2010-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -25,7 +25,6 @@
 #else
 #define CHARTSTMASK(c) 1<<(c*8)
 #endif
-	.text
 	.thumb
 
 @ ---------------------------------------------------------------------------
diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S
index 3e54519cc3354093cfc300e107113a3b1b89555d..b75d4143db5747b61e388aae06d714f2df0b9fc7 100644
--- a/string/arm/strcmp-armv6m.S
+++ b/string/arm/strcmp-armv6m.S
@@ -1,7 +1,7 @@
 /*
  * strcmp for ARMv6-M (optimized for performance, not size)
  *
- * Copyright (c) 2014-2019, Arm Limited.
+ * Copyright (c) 2014-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S
index 586c14daa6e76444c825f6171c10141a404219f7..51443e3430587eb04dd85354e266d8c3764b5519 100644
--- a/string/arm/strcmp.S
+++ b/string/arm/strcmp.S
@@ -1,7 +1,7 @@
 /*
  * strcmp for ARMv7
  *
- * Copyright (c) 2012-2019, Arm Limited.
+ * Copyright (c) 2012-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -11,7 +11,7 @@
    available.  Use ldrd to support wider loads, provided the data
    is sufficiently aligned.  Use saturating arithmetic to optimize
    the compares.  */
-#define __strcmp_arm strcmp
+
 #include "../asmdefs.h"
 
 /* Build Options:
@@ -125,7 +125,6 @@
 #endif
 	.endm
 
-	.text
 	.p2align	5
 L(strcmp_start_addr):
 #if STRCMP_NO_PRECHECK == 0
diff --git a/string/arm/strcpy.c b/string/arm/strcpy.c
index 25548108d98e583e16cff0630d781f6fc5059de5..02cf94ff4be009c2048886c0fedd59f521d73f56 100644
--- a/string/arm/strcpy.c
+++ b/string/arm/strcpy.c
@@ -1,10 +1,11 @@
 /*
  * strcpy
  *
- * Copyright (c) 2008-2019, Arm Limited.
+ * Copyright (c) 2008-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
+#if defined (__thumb2__) && !defined (__thumb__)
 
 /* For GLIBC:
 #include <string.h>
@@ -12,7 +13,7 @@
 
 #undef strcmp
 */
-#define __strcpy_arm strcpy
+
 #ifdef __thumb2__
 #define magic1(REG) "#0x01010101"
 #define magic2(REG) "#0x80808080"
@@ -111,13 +112,8 @@ __strcpy_arm (char* dst, const char* src)
 # else
        "tst	r2, #0xff\n\t"
        "itet	ne\n\t"
-# ifdef __clang__
-       "strhne	r2, [ip], #2\n\t"
-       "strbeq	r2, [ip]\n\t"
-# else
        "strneh	r2, [ip], #2\n\t"
        "streqb	r2, [ip]\n\t"
-# endif
        "tstne	r2, #0xff00\n\t"
 # endif
        "bne	5b\n\t"
@@ -133,3 +129,5 @@ __strcpy_arm (char* dst, const char* src)
        "BX LR");
 }
 /* For GLIBC: libc_hidden_builtin_def (strcpy) */
+
+#endif /* defined (__thumb2__) && !defined (__thumb__)  */
diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S
index 046148a3a3a55d5df5012123509c281540d32d30..5ad30c941586286c850cffd8be229fa7600df7f9 100644
--- a/string/arm/strlen-armv6t2.S
+++ b/string/arm/strlen-armv6t2.S
@@ -1,7 +1,7 @@
 /*
  * strlen - calculate the length of a string
  *
- * Copyright (c) 2010, Arm Limited.
+ * Copyright (c) 2010-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -12,7 +12,7 @@
    ARMv6T2, AArch32
 
  */
-#define __strlen_armv6t2 strlen
+
 #include "../asmdefs.h"
 
 #ifdef __ARMEB__
diff --git a/string/asmdefs.h b/string/asmdefs.h
index 31c0f9dc348d9dc520cd5eaf228761fb4c25a878..340b427a505bb4e9f9f0f3b00ebbfd2b52f887e6 100644
--- a/string/asmdefs.h
+++ b/string/asmdefs.h
@@ -1,7 +1,7 @@
 /*
  * Macros for asm code.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -81,4 +81,18 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
 
 #define L(l) .L ## l
 
+#ifdef __ILP32__
+  /* Clear the top 32 padding bits of pointer arguments as per AAPCS64.  */
+#define PTR_ARG(n)  mov w##n, w##n
+#else
+#define PTR_ARG(n)
+#endif
+
+#ifdef __ILP32__
+  /* Clear the top 32 padding bits of size arguments as per AAPCS64.  */
+#define SIZE_ARG(n)  mov w##n, w##n
+#else
+#define SIZE_ARG(n)
+#endif
+
 #endif
diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c
index 967507bb4d3fd73760fc11ec86629325fc7cdc44..d5d4ea7e0309a0a9e00dca54048cbb8dc7bb4c00 100644
--- a/string/bench/memcpy.c
+++ b/string/bench/memcpy.c
@@ -221,6 +221,40 @@ int main (void)
       printf ("\n");
     }
 
+  printf ("\nUnaligned forwards memmove:\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      printf ("%22s (B/ns) ", funtab[f].name);
+
+      for (int size = 1024; size <= 32768; size *= 2)
+	{
+	  uint64_t t = clock_get_ns ();
+	  for (int i = 0; i < ITERS3; i++)
+	    funtab[f].fun (a, a + 256 + (i & 31), size);
+	  t = clock_get_ns () - t;
+	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+		  size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+	}
+      printf ("\n");
+    }
+
+
+  printf ("\nUnaligned backwards memmove:\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      printf ("%22s (B/ns) ", funtab[f].name);
+
+      for (int size = 1024; size <= 32768; size *= 2)
+	{
+	  uint64_t t = clock_get_ns ();
+	  for (int i = 0; i < ITERS3; i++)
+	    funtab[f].fun (a + 256 + (i & 31), a, size);
+	  t = clock_get_ns () - t;
+	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+		  size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+	}
+      printf ("\n");
+    }
 
   return 0;
 }
diff --git a/string/bench/strlen.c b/string/bench/strlen.c
new file mode 100644
index 0000000000000000000000000000000000000000..cc0f04bee5471a4c623e047f773bde10f0e8aac7
--- /dev/null
+++ b/string/bench/strlen.c
@@ -0,0 +1,221 @@
+/*
+ * strlen benchmark.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "stringlib.h"
+#include "benchlib.h"
+
+#define ITERS 2000
+#define ITERS2 20000000
+#define ITERS3 2000000
+#define NUM_STRLEN 16384
+
+#define MAX_ALIGN 32
+#define MAX_STRLEN 256
+
+static char a[(MAX_STRLEN + 1) * MAX_ALIGN] __attribute__((__aligned__(4096)));
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+  const char *name;
+  size_t (*fun) (const char *s);
+  int test_mte;
+} funtab[] = {
+  // clang-format off
+  F(strlen, 0)
+#if __aarch64__
+  F(__strlen_aarch64, 0)
+  F(__strlen_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+  F(__strlen_aarch64_sve, 1)
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+  F(__strlen_armv6t2, 0)
+# endif
+#endif
+  {0, 0, 0}
+  // clang-format on
+};
+#undef F
+
+static uint16_t strlen_tests[NUM_STRLEN];
+
+typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
+typedef struct { uint8_t align; uint16_t freq; } align_data_t;
+
+#define SIZE_NUM 65536
+#define SIZE_MASK (SIZE_NUM - 1)
+static uint8_t strlen_len_arr[SIZE_NUM];
+
+/* Frequency data for strlen sizes up to 128 based on SPEC2017.  */
+static freq_data_t strlen_len_freq[] =
+{
+  { 12,22671}, { 18,12834}, { 13, 9555}, {  6, 6348}, { 17, 6095}, { 11, 2115},
+  { 10, 1335}, {  7,  814}, {  2,  646}, {  9,  483}, {  8,  471}, { 16,  418},
+  {  4,  390}, {  1,  388}, {  5,  233}, {  3,  204}, {  0,   79}, { 14,   79},
+  { 15,   69}, { 26,   36}, { 22,   35}, { 31,   24}, { 32,   24}, { 19,   21},
+  { 25,   17}, { 28,   15}, { 21,   14}, { 33,   14}, { 20,   13}, { 24,    9},
+  { 29,    9}, { 30,    9}, { 23,    7}, { 34,    7}, { 27,    6}, { 44,    5},
+  { 42,    4}, { 45,    3}, { 47,    3}, { 40,    2}, { 41,    2}, { 43,    2},
+  { 58,    2}, { 78,    2}, { 36,    2}, { 48,    1}, { 52,    1}, { 60,    1},
+  { 64,    1}, { 56,    1}, { 76,    1}, { 68,    1}, { 80,    1}, { 84,    1},
+  { 72,    1}, { 86,    1}, { 35,    1}, { 39,    1}, { 50,    1}, { 38,    1},
+  { 37,    1}, { 46,    1}, { 98,    1}, {102,    1}, {128,    1}, { 51,    1},
+  {107,    1}, { 0,     0}
+};
+
+#define ALIGN_NUM 1024
+#define ALIGN_MASK (ALIGN_NUM - 1)
+static uint8_t strlen_align_arr[ALIGN_NUM];
+
+/* Alignment data for strlen based on SPEC2017.  */
+static align_data_t string_align_freq[] =
+{
+  {8, 470}, {32, 427}, {16, 99}, {1, 19}, {2, 6}, {4, 3}, {0, 0}
+};
+
+static void
+init_strlen_distribution (void)
+{
+  int i, j, freq, size, n;
+
+  for (n = i = 0; (freq = strlen_len_freq[i].freq) != 0; i++)
+    for (j = 0, size = strlen_len_freq[i].size; j < freq; j++)
+      strlen_len_arr[n++] = size;
+  assert (n == SIZE_NUM);
+
+  for (n = i = 0; (freq = string_align_freq[i].freq) != 0; i++)
+    for (j = 0, size = string_align_freq[i].align; j < freq; j++)
+      strlen_align_arr[n++] = size;
+  assert (n == ALIGN_NUM);
+}
+
+static void
+init_strlen_tests (void)
+{
+  uint16_t index[MAX_ALIGN];
+
+  memset (a, 'x', sizeof (a));
+
+  /* Create a slot per alignment; slot i's NUL address is i mod MAX_ALIGN.  */
+  for (int i = 0; i < MAX_ALIGN; i++)
+    {
+      index[i] = i * (MAX_STRLEN + 1);
+      a[index[i] + MAX_STRLEN] = 0;
+    }
+
+  /* Create a random set of strlen input strings using the string length
+     and alignment distributions.  */
+  for (int n = 0; n < NUM_STRLEN; n++)
+    {
+      int align = strlen_align_arr[rand32 (0) & ALIGN_MASK];
+      int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK];
+
+      strlen_tests[n] =
+	index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len;
+    }
+}
+
+static volatile size_t maskv = 0;
+
+int main (void)
+{
+  rand32 (0x12345678);
+  init_strlen_distribution ();
+  init_strlen_tests ();
+
+  printf ("\nRandom strlen (bytes/ns):\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      size_t res = 0, strlen_size = 0, mask = maskv;
+      printf ("%22s ", funtab[f].name);
+
+      for (int c = 0; c < NUM_STRLEN; c++)
+	strlen_size += funtab[f].fun (a + strlen_tests[c]);
+      strlen_size *= ITERS;
+
+      /* mask is 0: (res & mask) just chains calls to measure latency.  */
+      uint64_t t = clock_get_ns ();
+      for (int i = 0; i < ITERS; i++)
+	for (int c = 0; c < NUM_STRLEN; c++)
+	  res = funtab[f].fun (a + strlen_tests[c] + (res & mask));
+      t = clock_get_ns () - t;
+      printf ("%.2f\n", (double)strlen_size / t);
+    }
+
+  printf ("\nSmall aligned strlen (bytes/ns):\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      printf ("%22s ", funtab[f].name);
+
+      for (int size = 1; size <= 64; size *= 2)
+	{
+	  memset (a, 'x', size);
+	  a[size - 1] = 0;
+
+	  uint64_t t = clock_get_ns ();
+	  for (int i = 0; i < ITERS2; i++)
+	    funtab[f].fun (a);
+	  t = clock_get_ns () - t;
+	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+		  size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+	}
+      printf ("\n");
+    }
+
+  printf ("\nSmall unaligned strlen (bytes/ns):\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      printf ("%22s ", funtab[f].name);
+
+      int align = 9;
+      for (int size = 1; size <= 64; size *= 2)
+	{
+	  memset (a + align, 'x', size);
+	  a[align + size - 1] = 0;
+
+	  uint64_t t = clock_get_ns ();
+	  for (int i = 0; i < ITERS2; i++)
+	    funtab[f].fun (a + align);
+	  t = clock_get_ns () - t;
+	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+		  size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+	}
+      printf ("\n");
+    }
+
+  printf ("\nMedium strlen (bytes/ns):\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      printf ("%22s ", funtab[f].name);
+
+      for (int size = 128; size <= 4096; size *= 2)
+	{
+	  memset (a, 'x', size);
+	  a[size - 1] = 0;
+
+	  uint64_t t = clock_get_ns ();
+	  for (int i = 0; i < ITERS3; i++)
+	    funtab[f].fun (a);
+	  t = clock_get_ns () - t;
+	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+		  size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+	}
+      printf ("\n");
+    }
+
+  printf ("\n");
+
+  return 0;
+}
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index 67b0dbfc3e436df2cc3f7b00d109aa6d36fd4f5f..378c3cd2d64590c05aa1cb80f6ba2559be017d2d 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -1,7 +1,7 @@
 /*
  * Public API.
  *
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -54,12 +54,11 @@ size_t __strlen_aarch64_sve (const char *);
 size_t __strnlen_aarch64_sve (const char *, size_t);
 int __strncmp_aarch64_sve (const char *, const char *, size_t);
 # endif
+# if __ARM_FEATURE_MEMORY_TAGGING
+void *__mtag_tag_region (void *, size_t);
+void *__mtag_tag_zero_region (void *, size_t);
+# endif
 #elif __arm__
-#define __memcpy_arm memcpy
-#define __memchr_arm memchr
-#define __strcpy_arm strcpy
-#define __strcmp_arm strcmp
-#define __strlen_armv6t2 strlen
 void *__memcpy_arm (void *__restrict, const void *__restrict, size_t);
 void *__memset_arm (void *, int, size_t);
 void *__memchr_arm (const void *, int, size_t);
diff --git a/string/test/__mtag_tag_region.c b/string/test/__mtag_tag_region.c
new file mode 100644
index 0000000000000000000000000000000000000000..d8c02d92d626a6e754b756cdcb17945e6a6a14ad
--- /dev/null
+++ b/string/test/__mtag_tag_region.c
@@ -0,0 +1,147 @@
+/*
+ * __mtag_tag_region test.
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+static void
+mtag_quoteat (const char *prefix, void *p, int len, int at)
+{
+  /* Print the tag read at P + AT, then untag before quoting the context.  */
+  printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at));
+  untag_buffer (p, len, 1);
+  void *plain = untag_pointer (p);
+  quoteat (prefix, plain, len, at);
+}
+
+#define F(x) {#x, x}, /* One table entry: stringified name, then function.  */
+
+static const struct fun
+{
+  const char *name; /* Printed by main () in the PASS/FAIL line.  */
+  void *(*fun) (void *s, size_t n); /* Routine called by test ().  */
+} funtab[] = {
+// clang-format off
+#if __aarch64__
+  F(__mtag_tag_region)
+#endif
+  {0, 0} /* Terminator: main () stops at the first null name.  */
+  // clang-format on
+};
+#undef F
+
+#define A 64 /* Alignment unit; the mask in alignup needs a power of two.  */
+#define LEN 250000 /* Largest length test () accepts.  */
+static unsigned char *sbuf; /* Assigned once in main ().  */
+
+static void *
+alignup (void *p)
+{
+  return (void *) (((uintptr_t) p + (A - 1)) & ~(uintptr_t) (A - 1));
+}
+
+/* Run FUN on LEN bytes at offset SALIGN; check its return and the data.  */
+static void
+test (const struct fun *fun, int salign, int len)
+{
+  unsigned char *src = alignup (sbuf);
+  unsigned char *s = src + salign;
+  const char *where;
+  void *ret;
+  int total, i;
+
+  if (err_count >= ERR_LIMIT)
+    return;
+  if (len > LEN || salign >= A)
+    abort ();
+
+  /* '?' everywhere, then 'a' over the LEN bytes passed to the routine.  */
+  total = len + 2 * A;
+  for (i = 0; i < total; i++)
+    src[i] = '?';
+  for (i = 0; i < len; i++)
+    s[i] = 'a';
+
+  /* Continue with the pointer tag_buffer returns; the routine is given
+     one with a different tag.  */
+  src = tag_buffer (src, total, 1);
+  s = __arm_mte_increment_tag (src + salign, 1);
+  ret = fun->fun (s, len);
+  if (ret != s)
+    ERR ("%s(%p,..) returned %p\n", fun->name, s, ret);
+
+  /* Head and tail are read through SRC, the body through S; every byte
+     must still hold what was written above.  */
+  for (i = 0; i < salign; i++)
+    if (src[i] != '?')
+      {
+	where = "got head";
+	goto fail;
+      }
+  for (; i < salign + len; i++)
+    if (s[i - salign] != 'a')
+      {
+	where = "got body";
+	goto fail;
+      }
+  for (; i < total; i++)
+    if (src[i] != '?')
+      {
+	where = "got tail";
+	goto fail;
+      }
+
+  untag_buffer (src, total, 1);
+  return;
+
+fail:
+  ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+  mtag_quoteat (where, src, total, i);
+}
+
+int
+main ()
+{
+  /* Report success without testing unless mte_enabled () returns nonzero.  */
+  if (!mte_enabled ())
+    return 0;
+
+  /* One mapping serves every test () call.  */
+  sbuf = mte_mmap (LEN + 3 * A);
+  int status = 0;
+  for (const struct fun *f = funtab; f->name; f++)
+    {
+      err_count = 0;
+      /* Offsets and lengths are all multiples of 16.  */
+      for (int off = 0; off < A; off += 16)
+	{
+	  /* Lengths 0, 16, ... below 200, then doubling while below LEN.  */
+	  int n;
+	  for (n = 0; n < 200; n += 16)
+	    test (f, off, n);
+	  for (; n < LEN; n *= 2)
+	    test (f, off, n);
+	}
+      printf ("%s %s\n", err_count ? "FAIL" : "PASS", f->name);
+      if (err_count)
+	status = -1;
+    }
+  return status;
+}
+#else
+int
+main ()
+{
+  return 0; /* Memory tagging or WANT_MTE_TEST is off: nothing to run.  */
+}
+#endif
diff --git a/string/test/__mtag_tag_zero_region.c b/string/test/__mtag_tag_zero_region.c
new file mode 100644
index 0000000000000000000000000000000000000000..221c223a2f3105ab02c7b21b9560a81bddf4355d
--- /dev/null
+++ b/string/test/__mtag_tag_zero_region.c
@@ -0,0 +1,147 @@
+/*
+ * __mtag_tag_zero_region test.
+ *
+ * Copyright (c) 2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mte.h"
+#include "stringlib.h"
+#include "stringtest.h"
+
+static void
+mtag_quoteat (const char *prefix, void *p, int len, int at)
+{
+  /* Print the tag read at P + AT, then untag before quoting the context.  */
+  printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at));
+  untag_buffer (p, len, 1);
+  void *plain = untag_pointer (p);
+  quoteat (prefix, plain, len, at);
+}
+
+#define F(x) {#x, x}, /* One table entry: stringified name, then function.  */
+
+static const struct fun
+{
+  const char *name; /* Printed by main () in the PASS/FAIL line.  */
+  void *(*fun) (void *s, size_t n); /* Routine called by test ().  */
+} funtab[] = {
+// clang-format off
+#if __aarch64__
+  F(__mtag_tag_zero_region)
+#endif
+  {0, 0} /* Terminator: main () stops at the first null name.  */
+  // clang-format on
+};
+#undef F
+
+#define A 64 /* Alignment unit; the mask in alignup needs a power of two.  */
+#define LEN 250000 /* Largest length test () accepts.  */
+static unsigned char *sbuf; /* Assigned once in main ().  */
+
+static void *
+alignup (void *p)
+{
+  return (void *) (((uintptr_t) p + (A - 1)) & ~(uintptr_t) (A - 1));
+}
+
+/* Run FUN on LEN bytes at offset SALIGN; check its return and the data.  */
+static void
+test (const struct fun *fun, int salign, int len)
+{
+  unsigned char *src = alignup (sbuf);
+  unsigned char *s = src + salign;
+  const char *where;
+  void *ret;
+  int total, i;
+
+  if (err_count >= ERR_LIMIT)
+    return;
+  if (len > LEN || salign >= A)
+    abort ();
+
+  /* '?' everywhere, then a repeating pattern over the LEN bytes passed on.  */
+  total = len + 2 * A;
+  for (i = 0; i < total; i++)
+    src[i] = '?';
+  for (i = 0; i < len; i++)
+    s[i] = 'a' + i % 23;
+
+  /* Continue with the pointer tag_buffer returns; the routine is given
+     one with a different tag.  */
+  src = tag_buffer (src, total, 1);
+  s = __arm_mte_increment_tag (src + salign, 1);
+  ret = fun->fun (s, len);
+  if (ret != s)
+    ERR ("%s(%p,..) returned %p\n", fun->name, s, ret);
+
+  /* Head and tail, read through SRC, must still hold '?'; the body, read
+     through S, must be all zero bytes.  */
+  for (i = 0; i < salign; i++)
+    if (src[i] != '?')
+      {
+	where = "got head";
+	goto fail;
+      }
+  for (; i < salign + len; i++)
+    if (s[i - salign] != 0)
+      {
+	where = "got body";
+	goto fail;
+      }
+  for (; i < total; i++)
+    if (src[i] != '?')
+      {
+	where = "got tail";
+	goto fail;
+      }
+
+  untag_buffer (src, total, 1);
+  return;
+
+fail:
+  ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
+  mtag_quoteat (where, src, total, i);
+}
+
+int
+main ()
+{
+  /* Report success without testing unless mte_enabled () returns nonzero.  */
+  if (!mte_enabled ())
+    return 0;
+
+  /* One mapping serves every test () call.  */
+  sbuf = mte_mmap (LEN + 3 * A);
+  int status = 0;
+  for (const struct fun *f = funtab; f->name; f++)
+    {
+      err_count = 0;
+      /* Offsets and lengths are all multiples of 16.  */
+      for (int off = 0; off < A; off += 16)
+	{
+	  /* Lengths 0, 16, ... below 200, then doubling while below LEN.  */
+	  int n;
+	  for (n = 0; n < 200; n += 16)
+	    test (f, off, n);
+	  for (; n < LEN; n *= 2)
+	    test (f, off, n);
+	}
+      printf ("%s %s\n", err_count ? "FAIL" : "PASS", f->name);
+      if (err_count)
+	status = -1;
+    }
+  return status;
+}
+#else
+int
+main ()
+{
+  return 0; /* Memory tagging or WANT_MTE_TEST is off: nothing to run.  */
+}
+#endif
diff --git a/string/test/memcmp.c b/string/test/memcmp.c
index dd936987b430199bf7f37b0c32f7c7a1c7e4e81d..7a7cf9cff35af2c22248dfd21609b7e83af68976 100644
--- a/string/test/memcmp.c
+++ b/string/test/memcmp.c
@@ -1,7 +1,7 @@
 /*
  * memcmp test.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/string/test/memcpy.c b/string/test/memcpy.c
index 346d92048ad6ffe4f484266d8dea3af9ca560a59..ce0ceeef5ee844e5feadaf2cb18020436e1e9b12 100644
--- a/string/test/memcpy.c
+++ b/string/test/memcpy.c
@@ -1,7 +1,7 @@
 /*
  * memcpy test.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/string/test/memmove.c b/string/test/memmove.c
index af92fe303814bd704f4bd9f8f457f112d52a34e6..689b68c98af264c8d5e485e7134a0f216fce555c 100644
--- a/string/test/memmove.c
+++ b/string/test/memmove.c
@@ -1,7 +1,7 @@
 /*
  * memmove test.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/string/test/memset.c b/string/test/memset.c
index cebe9ada293562dcf1836b8795efd72cba67492c..f1721442dbaf83f682859526632655c7ad65cd75 100644
--- a/string/test/memset.c
+++ b/string/test/memset.c
@@ -1,7 +1,7 @@
 /*
  * memset test.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/string/test/strcmp.c b/string/test/strcmp.c
index 4e718e3664d4876a7e68b9109ace560822b1050b..d57b54ed50a8a5e8b742805444510ec98a62851d 100644
--- a/string/test/strcmp.c
+++ b/string/test/strcmp.c
@@ -1,7 +1,7 @@
 /*
  * strcmp test.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/string/test/strncmp.c b/string/test/strncmp.c
index 23fbb0a4c8a63cbe51a3ec907f506ad52e04ff94..018a8a431ab8ca55110b814e0e089fde6f199772 100644
--- a/string/test/strncmp.c
+++ b/string/test/strncmp.c
@@ -1,7 +1,7 @@
 /*
  * strncmp test.
  *
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
diff --git a/string/test/strrchr.c b/string/test/strrchr.c
index b9684572f220ff76399edfedefc429f91cf5f2fb..fedbdc52fcc1151ffbbd168ef3bd1cb42c700ff0 100644
--- a/string/test/strrchr.c
+++ b/string/test/strrchr.c
@@ -1,7 +1,7 @@
 /*
  * strrchr test.
  *
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019-2021, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -91,7 +91,7 @@ test (const struct fun *fun, int align, int seekpos, int len)
   if (p != s + len)
     {
       ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
-	   fun->name, s, 0, len, p, f, len);
+	   fun->name, s, 0, len, p, s + len, len);
       quote ("input", s, len);
     }
 }