From bf59b0c54f559179c6773dd4697e9d88a2021503 Mon Sep 17 00:00:00 2001 From: gaoruoshu Date: Mon, 24 Apr 2023 19:25:21 +0800 Subject: [PATCH] grep:pcre use UCP in UTF mode --- backport-pcre-use-UCP-in-UTF-mode.patch | 88 +++++++++++++++++++++++++ grep.spec | 8 ++- 2 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 backport-pcre-use-UCP-in-UTF-mode.patch diff --git a/backport-pcre-use-UCP-in-UTF-mode.patch b/backport-pcre-use-UCP-in-UTF-mode.patch new file mode 100644 index 0000000..706034d --- /dev/null +++ b/backport-pcre-use-UCP-in-UTF-mode.patch @@ -0,0 +1,88 @@ +From 5e3b760f65f13856e5717e5b9d935f5b4a615be3 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= +Date: Fri, 6 Jan 2023 19:34:56 -0800 +Subject: [PATCH] pcre: use UCP in UTF mode + +This fixes a serious bug affecting word-boundary and word-constituent regular +expressions when the desired match involves non-ASCII UTF8 characters. +* src/pcresearch.c: Set PCRE2_UCP together with PCRE2_UTF +* tests/pcre-utf8-w: New file. +* tests/Makefile.am (TESTS): Add it. +* NEWS (Bug fixes): Mention this. +* THANKS.in: Add Gro-Tsen and Karl Pettersson. +Reported by Gro-Tsen https://twitter.com/gro_tsen/status/1610972356972875777 +via Karl Pettersson in https://github.com/PCRE2Project/pcre2/issues/185 +This bug was present from grep-2.5, when --perl-regexp (-P) support was added. + +Reference:https://git.savannah.gnu.org/cgit/grep.git/commit?id=5e3b760f65f13856e5717e5b9d935f5b4a615be3 +Conflict:delete NEWS,THANKS.in and change src/pcresearch.c +--- + src/pcresearch.c | 2 +- + tests/Makefile.am | 1 + + tests/pcre-utf8-w | 28 ++++++++++++++++++++++++++++ + 3 files changed, 30 insertions(+), 1 deletion(-) + create mode 100755 tests/pcre-utf8-w + +diff --git a/src/pcresearch.c b/src/pcresearch.c +index 577995f..0127073 100644 +--- a/src/pcresearch.c ++++ b/src/pcresearch.c +@@ -136,7 +136,7 @@ Pcompile (char *pattern, size_t size, reg_syntax_t ignored, bool exact) + { + if (! 
localeinfo.using_utf8) + die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales")); +- flags |= PCRE_UTF8; ++ flags |= (PCRE_UTF8 | PCRE_UCP); + } + + /* FIXME: Remove this restriction. */ +diff --git a/tests/Makefile.am b/tests/Makefile.am +index b05a126..d2968c6 100644 +--- a/tests/Makefile.am ++++ b/tests/Makefile.am +@@ -143,6 +143,7 @@ TESTS = \ + pcre-jitstack \ + pcre-o \ + pcre-utf8 \ ++ pcre-utf8-w \ + pcre-w \ + pcre-wx-backref \ + pcre-z \ +diff --git a/tests/pcre-utf8-w b/tests/pcre-utf8-w +new file mode 100755 +index 0000000..4cd7db6 +--- /dev/null ++++ b/tests/pcre-utf8-w +@@ -0,0 +1,28 @@ ++#!/bin/sh ++# Ensure non-ASCII UTF-8 characters are correctly identified as word-constituent ++# ++# Copyright (C) 2023 Free Software Foundation, Inc. ++# ++# Copying and distribution of this file, with or without modification, ++# are permitted in any medium without royalty provided the copyright ++# notice and this notice are preserved. ++ ++. "${srcdir=.}/init.sh"; path_prepend_ ../src ++require_en_utf8_locale_ ++LC_ALL=en_US.UTF-8 ++export LC_ALL ++require_pcre_ ++ ++fail=0 ++ ++echo 'Perú'> in || framework_failure_ ++ ++echo 'ú' > exp || framework_failure_ ++grep -Po '.\b' in > out || fail=1 ++compare exp out || fail=1 ++ ++echo 'rú' > exp || framework_failure_ ++grep -Po 'r\w' in > out || fail=1 ++compare exp out || fail=1 ++ ++Exit $fail +-- +2.27.0 + + diff --git a/grep.spec b/grep.spec index beaa8d9..03444be 100644 --- a/grep.spec +++ b/grep.spec @@ -1,6 +1,6 @@ Name: grep Version: 3.4 -Release: 2 +Release: 3 Summary: A string search utility License: GPLv3+ URL: http://www.gnu.org/software/grep/ @@ -10,6 +10,7 @@ Source2: colorgrep.csh Source3: grepconf.sh Patch1: backport-grep-avoid-sticky-problem-with-f-f.patch +Patch6001: backport-pcre-use-UCP-in-UTF-mode.patch BuildRequires: gcc git pcre-devel >= 3.9-10 texinfo gettext Provides: /bin/egrep /bin/fgrep /bin/grep bundled(gnulib) @@ -22,6 +23,7 @@ a specified pattern. 
By default, Grep outputs the matching lines. %autosetup -n %{name}-%{version} -p1 %build +autoreconf %configure --without-included-regex --disable-silent-rules \ CPPFLAGS="-I%{_includedir}/pcre" CFLAGS="$RPM_OPT_FLAGS" %make_build @@ -54,6 +56,10 @@ make check %changelog +* Mon Apr 24 2023 gaoruoshu - 3.4-3 +- Type:bugfix +- DESC:pcre: use UCP in UTF mode + * Tue Dec 20 2022 gaoruoshu - 3.4-2 - Type:bugfix - DESC:Added coloring aliases to fgrep egrep and grep -- Gitee