From fba0aecf8a7b94b77225555e041113846d85aad2 Mon Sep 17 00:00:00 2001 From: gaoruoshu Date: Mon, 24 Apr 2023 19:50:43 +0800 Subject: [PATCH] grep:pcre use UCP in UTF mode --- backport-pcre-use-UCP-in-UTF-mode.patch | 109 ++++++++++++++++++++++++ grep.spec | 7 +- 2 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 backport-pcre-use-UCP-in-UTF-mode.patch diff --git a/backport-pcre-use-UCP-in-UTF-mode.patch b/backport-pcre-use-UCP-in-UTF-mode.patch new file mode 100644 index 0000000..7eca2a9 --- /dev/null +++ b/backport-pcre-use-UCP-in-UTF-mode.patch @@ -0,0 +1,109 @@ +From 5e3b760f65f13856e5717e5b9d935f5b4a615be3 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= +Date: Fri, 6 Jan 2023 19:34:56 -0800 +Subject: [PATCH] pcre: use UCP in UTF mode + +This fixes a serious bug affecting word-boundary and word-constituent regular +expressions when the desired match involves non-ASCII UTF8 characters. +* src/pcresearch.c: Set PCRE2_UCP together with PCRE2_UTF +* tests/pcre-utf8-w: New file. +* tests/Makefile.am (TESTS): Add it. +* NEWS (Bug fixes): Mention this. +* THANKS.in: Add Gro-Tsen and Karl Petterson. +Reported by Gro-Tsen https://twitter.com/gro_tsen/status/1610972356972875777 +via Karl Pettersson in https://github.com/PCRE2Project/pcre2/issues/185 +This bug was present from grep-2.5, when --perl-regexp (-P) support was added. 
+ +Reference:https://git.savannah.gnu.org/cgit/grep.git/commit?id=5e3b760f65f13856e5717e5b9d935f5b4a615be3 +Conflict:delete NEWS and change src/pcresearch.c +--- + THANKS.in | 2 ++ + src/pcresearch.c | 2 +- + tests/Makefile.am | 1 + + tests/pcre-utf8-w | 28 ++++++++++++++++++++++++++++ + 4 files changed, 32 insertions(+), 1 deletion(-) + create mode 100755 tests/pcre-utf8-w + +diff --git a/THANKS.in b/THANKS.in +index 9872bfa..d0d6f92 100644 +--- a/THANKS.in ++++ b/THANKS.in +@@ -35,6 +35,7 @@ Gerald Stoller gerald_stoller@hotmail.com + Grant McDorman grant@isgtec.com + Greg Boyd gboyd.ccsf@gmail.com + Greg Louis glouis@dynamicro.on.ca ++Gro-Tsen https://twitter.com/gro_tsen + Guglielmo 'bond' Bondioni g.bondioni@libero.it + H. Merijn Brand h.m.brand@hccnet.nl + Harald Hanche-Olsen hanche@math.ntnu.no +@@ -50,6 +51,7 @@ Joel N. Weber II devnull@gnu.org + John Hughes john@nitelite.calvacom.fr + Jorge Stolfi stolfi@dcc.unicamp.br + Karl Heuer kwzh@gnu.org ++Karl Pettersson karl.pettersson@klpn.se + Kaveh R. Ghazi ghazi@caip.rutgers.edu + Kazuro Furukawa furukawa@apricot.kek.jp + Keith Bostic bostic@bsdi.com +diff --git a/src/pcresearch.c b/src/pcresearch.c +index a107f4d..45b67ee 100644 +--- a/src/pcresearch.c ++++ b/src/pcresearch.c +@@ -141,7 +141,7 @@ Pcompile (char *pattern, size_t size, reg_syntax_t ignored, bool exact) + { + if (! localeinfo.using_utf8) + die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales")); +- flags |= PCRE_UTF8; ++ flags |= (PCRE_UTF8 | PCRE_UCP); + } + + /* FIXME: Remove this restriction. 
*/ +diff --git a/tests/Makefile.am b/tests/Makefile.am +index e0b0503..a47cf5c 100644 +--- a/tests/Makefile.am ++++ b/tests/Makefile.am +@@ -147,6 +147,7 @@ TESTS = \ + pcre-jitstack \ + pcre-o \ + pcre-utf8 \ ++ pcre-utf8-w \ + pcre-w \ + pcre-wx-backref \ + pcre-z \ +diff --git a/tests/pcre-utf8-w b/tests/pcre-utf8-w +new file mode 100755 +index 0000000..4cd7db6 +--- /dev/null ++++ b/tests/pcre-utf8-w +@@ -0,0 +1,28 @@ ++#!/bin/sh ++# Ensure non-ASCII UTF-8 characters are correctly identified as word-constituent ++# ++# Copyright (C) 2023 Free Software Foundation, Inc. ++# ++# Copying and distribution of this file, with or without modification, ++# are permitted in any medium without royalty provided the copyright ++# notice and this notice are preserved. ++ ++. "${srcdir=.}/init.sh"; path_prepend_ ../src ++require_en_utf8_locale_ ++LC_ALL=en_US.UTF-8 ++export LC_ALL ++require_pcre_ ++ ++fail=0 ++ ++echo 'Perú'> in || framework_failure_ ++ ++echo 'ú' > exp || framework_failure_ ++grep -Po '.\b' in > out || fail=1 ++compare exp out || fail=1 ++ ++echo 'rú' > exp || framework_failure_ ++grep -Po 'r\w' in > out || fail=1 ++compare exp out || fail=1 ++ ++Exit $fail +-- +2.33.0 + + diff --git a/grep.spec b/grep.spec index 0f07167..c4689d9 100644 --- a/grep.spec +++ b/grep.spec @@ -1,6 +1,6 @@ Name: grep Version: 3.7 -Release: 5 +Release: 6 Summary: A string search utility License: GPLv3+ URL: http://www.gnu.org/software/grep/ @@ -12,6 +12,7 @@ Source3: grepconf.sh Patch1: backport-grep-avoid-sticky-problem-with-f-f.patch Patch2: backport-grep-s-does-not-suppress-binary-file-matches.patch Patch3: backport-grep-bug-backref-in-last-of-multiple-patter.patch +Patch4: backport-pcre-use-UCP-in-UTF-mode.patch BuildRequires: gcc pcre-devel >= 3.9-10 texinfo gettext libsigsegv-devel automake Provides: /bin/egrep /bin/fgrep /bin/grep bundled(gnulib) @@ -24,6 +25,7 @@ a specified pattern. By default, Grep outputs the matching lines. 
%autosetup -n %{name}-%{version} -p1 %build +autoreconf %configure --disable-silent-rules \ CPPFLAGS="-I%{_includedir}/pcre" CFLAGS="$RPM_OPT_FLAGS -fsigned-char" %make_build @@ -56,6 +58,9 @@ make check %changelog +* Mon Apr 24 2023 gaoruoshu - 3.7-6 +- pcre: use UCP in UTF mode + * Mon Dec 26 2022 gaoruoshu - 3.7-5 - backport patch from upstream -- Gitee