diff --git a/backport-Fix-the-lookahead-after-d-or-posix-to-skip-whitespac.patch b/backport-Fix-the-lookahead-after-d-or-posix-to-skip-whitespac.patch new file mode 100644 index 0000000000000000000000000000000000000000..b11db588823d0349947abbc9f1123fbb6ba5798e --- /dev/null +++ b/backport-Fix-the-lookahead-after-d-or-posix-to-skip-whitespac.patch @@ -0,0 +1,318 @@ +From 16d7edb56757e5294eeeecc9a19135aab89a50ba Mon Sep 17 00:00:00 2001 +From: Nicholas Wilson +Date: Fri, 1 Nov 2024 17:13:34 +0000 +Subject: [PATCH] Fix the lookahead after [\d or [[:posix] to skip whitespace + (#544) + +Conflict:don't modify alt_extended_class because fc38d9e784 is not merged; +don't modify class_op_state because class_op_state is not merged; adapt context +Reference:https://github.com/PCRE2Project/pcre2/commit/16d7edb56757e5294eeeecc9a19135aab89a50ba + +--- + src/pcre2_compile.c | 88 +++++++++++++++++++++++++++--------------- + src/pcre2_intmodedep.h | 2 +- + testdata/testinput1 | 20 +++++++--- + testdata/testinput2 | 8 ++++ + testdata/testoutput1 | 30 ++++++++++---- + testdata/testoutput2 | 12 ++++++ + 6 files changed, 113 insertions(+), 47 deletions(-) + +diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c +index 2493c871..9be26b07 100644 +--- a/src/pcre2_compile.c ++++ b/src/pcre2_compile.c +@@ -2681,7 +2681,14 @@ the main compiling phase. */ + /* States used for analyzing ranges in character classes. The two OK values + must be last. 
*/ + +-enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL }; ++enum { ++ RANGE_NO, /* State after '[' (initial), or '[a-z'; hyphen is literal */ ++ RANGE_STARTED, /* State after '[1-'; last-emitted code is META_RANGE_XYZ */ ++ RANGE_FORBID_NO, /* State after '[\d'; '-]' is allowed but not '-1]' */ ++ RANGE_FORBID_STARTED, /* State after '[\d-'*/ ++ RANGE_OK_ESCAPED, /* State after '[1'; hyphen may be a range */ ++ RANGE_OK_LITERAL /* State after '[\1'; hyphen may be a range */ ++}; + + /* Only in 32-bit mode can there be literals > META_END. A macro encapsulates + the storing of literal values in the main parsed pattern, where they can always +@@ -2734,6 +2741,7 @@ PCRE2_SPTR thisptr; + PCRE2_SPTR name; + PCRE2_SPTR ptrend = cb->end_pattern; + PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */ ++PCRE2_SPTR class_range_forbid_ptr = NULL; + named_group *ng; + nest_save *top_nest, *end_nests; + +@@ -3559,6 +3567,21 @@ while (ptr < ptrend) + goto FAILED; + } + ++ /* Perl treats a hyphen after a POSIX class as a literal, not the ++ start of a range. However, it gives a warning in its warning mode ++ unless the hyphen is the last character in the class. PCRE does not ++ have a warning mode, so we give an error, because this is likely an ++ error on the user's part. ++ ++ Roll back to the hyphen for the error position. */ ++ ++ if (class_range_state == RANGE_FORBID_STARTED) ++ { ++ ptr = class_range_forbid_ptr; ++ errorcode = ERR50; ++ goto FAILED; ++ } ++ + if (*ptr != CHAR_COLON) + { + errorcode = ERR13; +@@ -3579,26 +3602,12 @@ while (ptr < ptrend) + } + ptr = tempptr + 2; + +- /* Perl treats a hyphen after a POSIX class as a literal, not the +- start of a range. However, it gives a warning in its warning mode +- unless the hyphen is the last character in the class. PCRE does not +- have a warning mode, so we give an error, because this is likely an +- error on the user's part. 
*/ +- +- if (ptr < ptrend - 1 && *ptr == CHAR_MINUS && +- ptr[1] != CHAR_RIGHT_SQUARE_BRACKET) +- { +- errorcode = ERR50; +- goto FAILED; +- } +- +- /* Set "a hyphen is not the start of a range" for the -] case, and also +- in case the POSIX class is followed by \E or \Q\E (possibly repeated - +- fuzzers do that kind of thing) and *then* a hyphen. This causes that +- hyphen to be treated as a literal. I don't think it's worth setting up +- special apparatus to do otherwise. */ ++ /* Set "a hyphen is forbidden to be the start of a range". For the '-]' ++ case, the hyphen is treated as a literal, but for '-1' it is disallowed ++ (because it would be interpreted as range). */ + +- class_range_state = RANGE_NO; ++ class_range_state = RANGE_FORBID_NO; ++ class_range_forbid_ptr = ptr; + + /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some + of the POSIX classes are converted to use Unicode properties \p or \P +@@ -3648,6 +3657,14 @@ while (ptr < ptrend) + class_range_state = RANGE_STARTED; + } + ++ /* Handle forbidden start of range */ ++ ++ else if (c == CHAR_MINUS && class_range_state == RANGE_FORBID_NO) ++ { ++ *parsed_pattern++ = CHAR_MINUS; ++ class_range_state = RANGE_FORBID_STARTED; ++ } ++ + /* Handle a literal character */ + + else if (c != CHAR_BACKSLASH) +@@ -3670,6 +3687,12 @@ while (ptr < ptrend) + } + class_range_state = RANGE_NO; + } ++ else if (class_range_state == RANGE_FORBID_STARTED) ++ { ++ ptr = class_range_forbid_ptr; ++ errorcode = ERR50; ++ goto FAILED; ++ } + else /* Potential start of range */ + { + class_range_state = char_is_literal? +@@ -3733,13 +3756,23 @@ while (ptr < ptrend) + if (class_range_state == RANGE_STARTED) + { + errorcode = ERR50; +- goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */ ++ goto FAILED; ++ } ++ /* Perl gives a warning unless the hyphen following a multi-character ++ escape is the last character in the class. PCRE throws an error. 
*/ ++ if (class_range_state == RANGE_FORBID_STARTED) ++ { ++ ptr = class_range_forbid_ptr; ++ errorcode = ERR50; ++ goto FAILED; + } + + /* Of the remaining escapes, only those that define characters are + allowed in a class. None may start a range. */ + +- class_range_state = RANGE_NO; ++ class_range_state = RANGE_FORBID_NO; ++ class_range_forbid_ptr = ptr; ++ + switch(escape) + { + case ESC_N: +@@ -3779,6 +3812,7 @@ while (ptr < ptrend) + if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P; + *parsed_pattern++ = META_ESCAPE + escape; + *parsed_pattern++ = (ptype << 16) | pdata; ++ class_range_forbid_ptr = ptr; + } + #else + errorcode = ERR45; +@@ -3791,16 +3825,6 @@ while (ptr < ptrend) + ptr--; + goto FAILED; + } +- +- /* Perl gives a warning unless a following hyphen is the last character +- in the class. PCRE throws an error. */ +- +- if (ptr < ptrend - 1 && *ptr == CHAR_MINUS && +- ptr[1] != CHAR_RIGHT_SQUARE_BRACKET) +- { +- errorcode = ERR50; +- goto FAILED; +- } + } + + /* Proceed to next thing in the class. */ +diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h +index 598060c9..a11b4faa 100644 +--- a/src/pcre2_intmodedep.h ++++ b/src/pcre2_intmodedep.h +@@ -435,7 +435,7 @@ UTF-16 mode. */ + c = *eptr; \ + if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len); + +-/* Get the next UTF-816character, testing for UTF-16 mode, not advancing the ++/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the + pointer, incrementing length if there is a low surrogate. This is called when + we do not know if we are in UTF-16 mode. 
*/ + +diff --git a/testdata/testinput1 b/testdata/testinput1 +index 0794502e..1e50369f 100644 +--- a/testdata/testinput1 ++++ b/testdata/testinput1 +@@ -5787,12 +5787,6 @@ ef) x/x,mark + + /(?'c')XX(?'YYYYYYYYYYYYYYYYYYYYYYYCl')/ + +-/[s[:digit:]\E-H]+/ +- s09-H +- +-/[s[:digit:]\Q\E-H]+/ +- s09-H +- + /a+(?:|b)a/ + aaaa + +@@ -6435,4 +6429,18 @@ ef) x/x,mark + /(a\K.(?1)*)/ + abac + ++/[[:digit:]- ]/xx ++ 1 ++ - ++\= Expect no match ++ z ++ \ \ ++ ++/[\d- ]/xx ++ 1 ++ - ++\= Expect no match ++ z ++ \ \ ++ + # End of testinput1 +diff --git a/testdata/testinput2 b/testdata/testinput2 +index b6464a0b..61b94e69 100644 +--- a/testdata/testinput2 ++++ b/testdata/testinput2 +@@ -5981,4 +5981,12 @@ a)"xI + a + a\=noteol + ++/[[:digit:] -Z]/xx ++ ++/[\d -Z]/xx ++ ++/[[:digit:]\E-H]/ ++ ++/[[:digit:]\Q\E-H]+/ ++ + # End of testinput2 +diff --git a/testdata/testoutput1 b/testdata/testoutput1 +index 8daf8362..6f927729 100644 +--- a/testdata/testoutput1 ++++ b/testdata/testoutput1 +@@ -9246,14 +9246,6 @@ No match + + /(?'c')XX(?'YYYYYYYYYYYYYYYYYYYYYYYCl')/ + +-/[s[:digit:]\E-H]+/ +- s09-H +- 0: s09-H +- +-/[s[:digit:]\Q\E-H]+/ +- s09-H +- 0: s09-H +- + /a+(?:|b)a/ + aaaa + 0: aaaa +@@ -10197,4 +10189,26 @@ No match + 0: c + 1: abac + ++/[[:digit:]- ]/xx ++ 1 ++ 0: 1 ++ - ++ 0: - ++\= Expect no match ++ z ++No match ++ \ \ ++No match ++ ++/[\d- ]/xx ++ 1 ++ 0: 1 ++ - ++ 0: - ++\= Expect no match ++ z ++No match ++ \ \ ++No match ++ + # End of testinput1 +diff --git a/testdata/testoutput2 b/testdata/testoutput2 +index 1075b4d4..86bfe964 100644 +--- a/testdata/testoutput2 ++++ b/testdata/testoutput2 +@@ -17815,6 +17815,18 @@ Subject length lower bound = 2 + a\=noteol + 0: a + ++/[[:digit:] -Z]/xx ++Failed: error 150 at offset 10: invalid range in character class ++ ++/[\d -Z]/xx ++Failed: error 150 at offset 3: invalid range in character class ++ ++/[[:digit:]\E-H]/ ++Failed: error 150 at offset 10: invalid range in character class ++ ++/[[:digit:]\Q\E-H]+/ ++Failed: error 150 
at offset 10: invalid range in character class ++ + # End of testinput2 + Error -70: PCRE2_ERROR_BADDATA (unknown error number) + Error -62: bad serialized data +-- +2.33.0 + diff --git a/backport-Further-ASCII-tests-and-minor-bugfix-plus-ChangeLog-.patch b/backport-Further-ASCII-tests-and-minor-bugfix-plus-ChangeLog-.patch new file mode 100644 index 0000000000000000000000000000000000000000..b4847fb3a48a2e3342c59195597aae7c89438525 --- /dev/null +++ b/backport-Further-ASCII-tests-and-minor-bugfix-plus-ChangeLog-.patch @@ -0,0 +1,104 @@ +From fc56fd790c1a3ba8f2890fc2b6afba21250923de Mon Sep 17 00:00:00 2001 +From: Philip Hazel +Date: Thu, 2 Feb 2023 17:19:45 +0000 +Subject: [PATCH] Further ASCII tests and minor bugfix plus ChangeLog update + +Conflict:don't modify ChangeLog +Reference:https://github.com/PCRE2Project/pcre2/commit/fc56fd790c1a3ba8f2890fc2b6afba21250923de + +--- + src/pcre2_compile.c | 5 ++--- + testdata/testinput5 | 5 +++++ + testdata/testinput7 | 5 +++++ + testdata/testoutput5 | 7 +++++++ + testdata/testoutput7 | 7 +++++++ + 5 files changed, 26 insertions(+), 3 deletions(-) + +diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c +index b8a9e098..64a35bda 100644 +--- a/src/pcre2_compile.c ++++ b/src/pcre2_compile.c +@@ -2660,10 +2660,9 @@ the main compiling phase. */ + PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \ + PCRE2_UNGREEDY) + +-#define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT) +- + #define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT| \ +- PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW) ++ PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW| \ ++ PCRE2_EXTRA_ASCII_POSIX) + + /* States used for analyzing ranges in character classes. The two OK values + must be last. 
*/ +diff --git a/testdata/testinput5 b/testdata/testinput5 +index 6e186cf0..49b46f82 100644 +--- a/testdata/testinput5 ++++ b/testdata/testinput5 +@@ -2434,6 +2434,11 @@ + /(?aP)[[:alnum:]\d]+/i,ucp,utf + abc\x{660}xyz + ++/(*UCP)(*UTF)[[:alnum:]](?aP:[[:alnum:]])[[:alnum:]]/ ++ \x{660}A\x{660} ++\= Expect no match ++ \x{660}\x{660}\x{660} ++ + # VARIOUS + + /[\d\s\w]+/a,ucp,utf +diff --git a/testdata/testinput7 b/testdata/testinput7 +index 64a37ad2..a2b7fb8d 100644 +--- a/testdata/testinput7 ++++ b/testdata/testinput7 +@@ -2453,6 +2453,11 @@ + /(?aP)[[:alnum:]\d]+/i,ucp,utf + abc\x{660}xyz + ++/(*UCP)(*UTF)[[:alnum:]](?aP:[[:alnum:]])[[:alnum:]]/ ++ \x{660}A\x{660} ++\= Expect no match ++ \x{660}\x{660}\x{660} ++ + # VARIOUS + + /[\d\s\w]+/a,ucp,utf +diff --git a/testdata/testoutput5 b/testdata/testoutput5 +index 26972f70..4f845c84 100644 +--- a/testdata/testoutput5 ++++ b/testdata/testoutput5 +@@ -5365,6 +5365,13 @@ No match + abc\x{660}xyz + 0: abc\x{660}xyz + ++/(*UCP)(*UTF)[[:alnum:]](?aP:[[:alnum:]])[[:alnum:]]/ ++ \x{660}A\x{660} ++ 0: \x{660}A\x{660} ++\= Expect no match ++ \x{660}\x{660}\x{660} ++No match ++ + # VARIOUS + + /[\d\s\w]+/a,ucp,utf +diff --git a/testdata/testoutput7 b/testdata/testoutput7 +index c830748c..4065981d 100644 +--- a/testdata/testoutput7 ++++ b/testdata/testoutput7 +@@ -4105,6 +4105,13 @@ No match + abc\x{660}xyz + 0: abc\x{660}xyz + ++/(*UCP)(*UTF)[[:alnum:]](?aP:[[:alnum:]])[[:alnum:]]/ ++ \x{660}A\x{660} ++ 0: \x{660}A\x{660} ++\= Expect no match ++ \x{660}\x{660}\x{660} ++No match ++ + # VARIOUS + + /[\d\s\w]+/a,ucp,utf +-- +2.33.0 + diff --git a/backport-Improve-error-offsets-for-character-classes-548.patch b/backport-Improve-error-offsets-for-character-classes-548.patch new file mode 100644 index 0000000000000000000000000000000000000000..0a714e1611601d5c896f52bfdaa0bc22100fff41 --- /dev/null +++ b/backport-Improve-error-offsets-for-character-classes-548.patch @@ -0,0 +1,425 @@ +From 6185344ed8617ff84a08764e808e5b3667c34a7a Mon 
Sep 17 00:00:00 2001 +From: Nicholas Wilson +Date: Wed, 6 Nov 2024 08:45:46 +0000 +Subject: [PATCH] Improve error offsets for character classes (#548) + +Conflict:don't modify alt_extended_class because fc38d9e784 is not merged; +don't modify class_op_state because class_op_state is not merged; adapt context +Reference:https://github.com/PCRE2Project/pcre2/commit/6185344ed8617ff84a08764e808e5b3667c34a7a + +* Error offset should be advanced by one character for "[\d-z]" + invalid range error + + The code does a 1-char lookahead for a hyphen, but then doesn't + advance the pointer to consume the hyphen when returning the error. + + Perl's error message (with "use warnings") does advance to just + after the hyphen, so PCRE2 should match. + + Fixes #545. + +* Also improve error offsets for [[:bad:]], [[=...=]] and [z-\p{...}] + cases +--- + src/pcre2_compile.c | 67 +++++++++++++++++++------------------- + testdata/testinput2 | 8 +++++ + testdata/testinput5 | 8 +++++ + testdata/testoutput2 | 76 +++++++++++++++++++++++++------------------- + testdata/testoutput5 | 14 +++++++- + 5 files changed, 106 insertions(+), 67 deletions(-) + +diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c +index 32db44db..290e759b 100644 +--- a/src/pcre2_compile.c ++++ b/src/pcre2_compile.c +@@ -3563,6 +3563,7 @@ while (ptr < ptrend) + + if (class_range_state == RANGE_STARTED) + { ++ ptr = tempptr + 2; + errorcode = ERR50; + goto FAILED; + } +@@ -3584,8 +3585,9 @@ while (ptr < ptrend) + + if (*ptr != CHAR_COLON) + { ++ ptr = tempptr + 2; + errorcode = ERR13; +- goto FAILED_BACK; ++ goto FAILED; + } + + if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT) +@@ -3595,19 +3597,18 @@ while (ptr < ptrend) + } + + posix_class = check_posix_name(ptr, (int)(tempptr - ptr)); ++ ptr = tempptr + 2; + if (posix_class < 0) + { + errorcode = ERR30; + goto FAILED; + } +- ptr = tempptr + 2; + + /* Set "a hyphen is forbidden to be the start of a range". 
For the '-]' + case, the hyphen is treated as a literal, but for '-1' it is disallowed + (because it would be interpreted as range). */ + + class_range_state = RANGE_FORBID_NO; +- class_range_forbid_ptr = ptr; + + /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some + of the POSIX classes are converted to use Unicode properties \p or \P +@@ -3664,6 +3665,7 @@ while (ptr < ptrend) + { + *parsed_pattern++ = CHAR_MINUS; + class_range_state = RANGE_FORBID_STARTED; ++ class_range_forbid_ptr = ptr; + } + + /* Handle a literal character */ +@@ -3746,37 +3748,8 @@ while (ptr < ptrend) + errorcode = ERR7; + ptr--; + goto FAILED; +- } + +- /* The second part of a range can be a single-character escape +- sequence (detected above), but not any of the other escapes. Perl +- treats a hyphen as a literal in such circumstances. However, in Perl's +- warning mode, a warning is given, so PCRE now faults it, as it is +- almost certainly a mistake on the user's part. */ +- +- if (class_range_state == RANGE_STARTED) +- { +- errorcode = ERR50; +- goto FAILED; +- } +- /* Perl gives a warning unless the hyphen following a multi-character +- escape is the last character in the class. PCRE throws an error. */ +- if (class_range_state == RANGE_FORBID_STARTED) +- { +- ptr = class_range_forbid_ptr; +- errorcode = ERR50; +- goto FAILED; +- } +- +- /* Of the remaining escapes, only those that define characters are +- allowed in a class. None may start a range. */ +- +- class_range_state = RANGE_FORBID_NO; +- class_range_forbid_ptr = ptr; +- +- switch(escape) +- { +- case ESC_N: ++ case ESC_N: /* Not permitted by Perl either */ + errorcode = ERR71; + goto FAILED; + +@@ -3813,7 +3786,6 @@ while (ptr < ptrend) + if (negated) escape = (escape == ESC_P)? 
ESC_p : ESC_P; + *parsed_pattern++ = META_ESCAPE + escape; + *parsed_pattern++ = (ptype << 16) | pdata; +- class_range_forbid_ptr = ptr; + } + #else + errorcode = ERR45; +@@ -3826,6 +3798,33 @@ while (ptr < ptrend) + ptr--; + goto FAILED; + } ++ ++ /* All the switch-cases above which end in "break" describe a set ++ of characters. None may start a range. */ ++ ++ /* The second part of a range can be a single-character escape ++ sequence (detected above), but not any of the other escapes. Perl ++ treats a hyphen as a literal in such circumstances. However, in Perl's ++ warning mode, a warning is given, so PCRE now faults it, as it is ++ almost certainly a mistake on the user's part. */ ++ ++ if (class_range_state == RANGE_STARTED) ++ { ++ errorcode = ERR50; ++ goto FAILED; ++ } ++ ++ /* Perl gives a warning unless the hyphen following a multi-character ++ escape is the last character in the class. PCRE throws an error. */ ++ ++ if (class_range_state == RANGE_FORBID_STARTED) ++ { ++ ptr = class_range_forbid_ptr; ++ errorcode = ERR50; ++ goto FAILED; ++ } ++ ++ class_range_state = RANGE_FORBID_NO; + } + + /* Proceed to next thing in the class. 
*/ +diff --git a/testdata/testinput2 b/testdata/testinput2 +index 61b94e69..1fbb778e 100644 +--- a/testdata/testinput2 ++++ b/testdata/testinput2 +@@ -7008,4 +7008,12 @@ a)"xI + + /[[:digit:]\Q\E-H]+/ + ++/[z-[:space:]]/ ++ ++/[z-\d]/ ++ ++/[[:space:]-z]/ ++ ++/[\d-z]/ ++ + # End of testinput2 +diff --git a/testdata/testinput5 b/testdata/testinput5 +index 494371b5..f3faeb8f 100644 +--- a/testdata/testinput5 ++++ b/testdata/testinput5 +@@ -2458,4 +2458,12 @@ + /abc/utf,substitute_extended,replace=>\777< + abc + ++/[z-\p{Lu}]/ ++ ++/[z-\pL]/ ++ ++/[\p{Lu}-z]/ ++ ++/[\pL-z]/ ++ + # End of testinput5 +diff --git a/testdata/testoutput2 b/testdata/testoutput2 +index 86bfe964..99714596 100644 +--- a/testdata/testoutput2 ++++ b/testdata/testoutput2 +@@ -2176,13 +2176,13 @@ Starting code units: % 0 1 A B C D E F G H I J K L M N O P Q R S T U V W + Subject length lower bound = 1 + + /[[.ch.]]/I +-Failed: error 113 at offset 1: POSIX collating elements are not supported ++Failed: error 113 at offset 7: POSIX collating elements are not supported + + /[[=ch=]]/I +-Failed: error 113 at offset 1: POSIX collating elements are not supported ++Failed: error 113 at offset 7: POSIX collating elements are not supported + + /[[:rhubarb:]]/I +-Failed: error 130 at offset 3: unknown POSIX class name ++Failed: error 130 at offset 12: unknown POSIX class name + + /[[:upper:]]/Ii + Capture group count = 0 +@@ -8722,31 +8722,31 @@ Failed: error 162 at offset 4: subpattern name expected + Failed: error 162 at offset 4: subpattern name expected + + /[[:foo:]]/ +-Failed: error 130 at offset 3: unknown POSIX class name ++Failed: error 130 at offset 8: unknown POSIX class name + + /[[:1234:]]/ +-Failed: error 130 at offset 3: unknown POSIX class name ++Failed: error 130 at offset 9: unknown POSIX class name + + /[[:f\oo:]]/ +-Failed: error 130 at offset 3: unknown POSIX class name ++Failed: error 130 at offset 9: unknown POSIX class name + + /[[: :]]/ +-Failed: error 130 at offset 3: unknown POSIX 
class name ++Failed: error 130 at offset 6: unknown POSIX class name + + /[[:...:]]/ +-Failed: error 130 at offset 3: unknown POSIX class name ++Failed: error 130 at offset 8: unknown POSIX class name + + /[[:l\ower:]]/ +-Failed: error 130 at offset 3: unknown POSIX class name ++Failed: error 130 at offset 11: unknown POSIX class name + + /[[:abc\:]]/ +-Failed: error 130 at offset 3: unknown POSIX class name ++Failed: error 130 at offset 9: unknown POSIX class name + + /[abc[:x\]pqr:]]/ +-Failed: error 130 at offset 6: unknown POSIX class name ++Failed: error 130 at offset 14: unknown POSIX class name + + /[[:a\dz:]]/ +-Failed: error 130 at offset 3: unknown POSIX class name ++Failed: error 130 at offset 9: unknown POSIX class name + + /(^(a|b\g<-1'c))/ + Failed: error 157 at offset 8: \g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number +@@ -11409,7 +11409,7 @@ Failed: error 171 at offset 4: \N is not supported in a class + aNc + + /a[B-\Nc]/ +-Failed: error 150 at offset 6: invalid range in character class ++Failed: error 171 at offset 6: \N is not supported in a class + + /a[B\Nc]/ + Failed: error 171 at offset 5: \N is not supported in a class +@@ -13232,16 +13232,16 @@ Failed: error 178 at offset 5: digits missing in \x{} or \o{} or \N{U+} + ------------------------------------------------------------------ + + /[a-[:digit:]]+/ +-Failed: error 150 at offset 4: invalid range in character class ++Failed: error 150 at offset 12: invalid range in character class + + /[A-[:digit:]]+/ +-Failed: error 150 at offset 4: invalid range in character class ++Failed: error 150 at offset 12: invalid range in character class + + /[a-[.xxx.]]+/ +-Failed: error 150 at offset 4: invalid range in character class ++Failed: error 150 at offset 10: invalid range in character class + + /[a-[=xxx=]]+/ +-Failed: error 150 at offset 4: invalid range in character class ++Failed: error 150 at offset 10: invalid range in character class + + 
/[a-[!xxx!]]+/ + Failed: error 108 at offset 3: range out of order in character class +@@ -13362,7 +13362,7 @@ No match + No match + + /[a[:<:]] should give error/ +-Failed: error 130 at offset 4: unknown POSIX class name ++Failed: error 130 at offset 7: unknown POSIX class name + + /(?=ab\K)/aftertext,allow_lookaround_bsk + abcd\=startchar +@@ -15510,11 +15510,11 @@ Failed: error 125 at offset 13: lookbehind assertion is not fixed length + # Perl accepts these, but gives a warning. We can't warn, so give an error. + + /[a-[:digit:]]+/ +-Failed: error 150 at offset 4: invalid range in character class ++Failed: error 150 at offset 12: invalid range in character class + a-a9-a + + /[A-[:digit:]]+/ +-Failed: error 150 at offset 4: invalid range in character class ++Failed: error 150 at offset 12: invalid range in character class + A-A9-A + + /[a-\d]+/ +@@ -15651,7 +15651,7 @@ Failed: error 128 at offset 63: assertion expected after (?( or (?(?C) + .+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X + + /[:[:alnum:]-[[a:lnum:]+/ +-Failed: error 150 at offset 11: invalid range in character class ++Failed: error 150 at offset 12: invalid range in character class + + /((?(?C'')\QX\E(?!((?(?C'')(?!X=X));=)r*X=X));=)/ + Failed: error 128 at offset 11: assertion expected after (?( or (?(?C) +@@ -16285,10 +16285,10 @@ Subject length lower bound = 3 + ------------------------------------------------------------------ + + /[Q-\N]/B,bad_escape_is_literal +-Failed: error 150 at offset 5: invalid range in character class ++Failed: error 171 at offset 5: \N is not supported in a class + + /[\s-_]/bad_escape_is_literal +-Failed: error 150 at offset 3: invalid range in character class ++Failed: error 150 at offset 4: invalid range in character class + + /[_-\s]/bad_escape_is_literal + Failed: error 150 at offset 5: invalid range in character class +@@ -16443,19 +16443,19 @@ No match + No match + + /[[:digit:]-a]/ +-Failed: error 150 at offset 10: invalid range 
in character class ++Failed: error 150 at offset 11: invalid range in character class + + /[[:digit:]-[:print:]]/ +-Failed: error 150 at offset 10: invalid range in character class ++Failed: error 150 at offset 11: invalid range in character class + + /[\d-a]/ +-Failed: error 150 at offset 3: invalid range in character class ++Failed: error 150 at offset 4: invalid range in character class + + /[\H-z]/ +-Failed: error 150 at offset 3: invalid range in character class ++Failed: error 150 at offset 4: invalid range in character class + + /[\d-[:print:]]/ +-Failed: error 150 at offset 3: invalid range in character class ++Failed: error 150 at offset 4: invalid range in character class + + # Perl gets the second of these wrong, giving no match. + +@@ -17816,16 +17816,28 @@ Subject length lower bound = 2 + 0: a + + /[[:digit:] -Z]/xx +-Failed: error 150 at offset 10: invalid range in character class ++Failed: error 150 at offset 14: invalid range in character class + + /[\d -Z]/xx +-Failed: error 150 at offset 3: invalid range in character class ++Failed: error 150 at offset 7: invalid range in character class + + /[[:digit:]\E-H]/ +-Failed: error 150 at offset 10: invalid range in character class ++Failed: error 150 at offset 13: invalid range in character class + + /[[:digit:]\Q\E-H]+/ +-Failed: error 150 at offset 10: invalid range in character class ++Failed: error 150 at offset 15: invalid range in character class ++ ++/[z-[:space:]]/ ++Failed: error 150 at offset 12: invalid range in character class ++ ++/[z-\d]/ ++Failed: error 150 at offset 5: invalid range in character class ++ ++/[[:space:]-z]/ ++Failed: error 150 at offset 11: invalid range in character class ++ ++/[\d-z]/ ++Failed: error 150 at offset 4: invalid range in character class + + # End of testinput2 + Error -70: PCRE2_ERROR_BADDATA (unknown error number) +diff --git a/testdata/testoutput5 b/testdata/testoutput5 +index bf06ee12..0dba11c6 100644 +--- a/testdata/testoutput5 ++++ 
b/testdata/testoutput5 +@@ -795,7 +795,7 @@ No match + No match + + /[[:a\x{100}b:]]/utf +-Failed: error 130 at offset 3: unknown POSIX class name ++Failed: error 130 at offset 14: unknown POSIX class name + + /a[^]b/utf,allow_empty_class,match_unset_backref + a\x{1234}b +@@ -5403,4 +5403,16 @@ No match + abc + 1: >\x{1ff}< + ++/[z-\p{Lu}]/ ++Failed: error 150 at offset 9: invalid range in character class ++ ++/[z-\pL]/ ++Failed: error 150 at offset 6: invalid range in character class ++ ++/[\p{Lu}-z]/ ++Failed: error 150 at offset 8: invalid range in character class ++ ++/[\pL-z]/ ++Failed: error 150 at offset 5: invalid range in character class ++ + # End of testinput5 +-- +2.33.0 + diff --git a/backport-Non-recursive-scan-prefix-in-JIT-560.patch b/backport-Non-recursive-scan-prefix-in-JIT-560.patch new file mode 100644 index 0000000000000000000000000000000000000000..f4ac1b57395db48f5bc46a71a1f88e1e71d37756 --- /dev/null +++ b/backport-Non-recursive-scan-prefix-in-JIT-560.patch @@ -0,0 +1,459 @@ +From 6f2da25f009ff463cd9357ae5ebe452fbec8ab5c Mon Sep 17 00:00:00 2001 +From: Zoltan Herczeg +Date: Fri, 15 Nov 2024 13:21:03 +0100 +Subject: [PATCH] Non-recursive scan prefix in JIT (#560) + +Conflict:NA +Reference:https://github.com/PCRE2Project/pcre2/commit/6f2da25f009ff463cd9357ae5ebe452fbec8ab5c + +--- + src/pcre2_jit_compile.c | 238 ++++++++++++++++++++++++++++------------ + src/pcre2_jit_test.c | 1 + + 2 files changed, 168 insertions(+), 71 deletions(-) + +diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c +index 127c393d..4449d59f 100644 +--- a/src/pcre2_jit_compile.c ++++ b/src/pcre2_jit_compile.c +@@ -5670,11 +5670,38 @@ if (last) + chars->last_count++; + } + +-static int scan_prefix(compiler_common *common, PCRE2_SPTR cc, fast_forward_char_data *chars, int max_chars, sljit_u32 *rec_count) ++/* Value can be increased if needed. Patterns ++such as /(a|){33}b/ can exhaust the stack. 
++ ++Note: /(a|){29}b/ already stops scan_prefix() ++because it reaches the maximum step_count. */ ++#define SCAN_PREFIX_STACK_END 32 ++ ++/* ++Scan prefix stores the prefix string in the chars array. ++The elements of the chars array is either small character ++sets or "any" (count is set to 255). ++ ++Examples (the chars array is represented by a simple regex): ++ ++/(abc|xbyd)/ prefix: /[ax]b[cy]/ (length: 3) ++/a[a-z]b+c/ prefix: a.b (length: 3) ++/ab?cd/ prefix: a[bc][cd] (length: 3) ++/(ab|cd)|(ef|gh)/ prefix: [aceg][bdfh] (length: 2) ++ ++The length is returned by scan_prefix(). The length is ++less than or equal than the minimum length of the pattern. ++*/ ++ ++static int scan_prefix(compiler_common *common, PCRE2_SPTR cc, fast_forward_char_data *chars) + { +-/* Recursive function, which scans prefix literals. */ ++fast_forward_char_data *chars_start = chars; ++fast_forward_char_data *chars_end = chars + MAX_N_CHARS; ++PCRE2_SPTR cc_stack[SCAN_PREFIX_STACK_END]; ++fast_forward_char_data *chars_stack[SCAN_PREFIX_STACK_END]; ++sljit_u8 next_alternative_stack[SCAN_PREFIX_STACK_END]; + BOOL last, any, class, caseless; +-int len, repeat, len_save, consumed = 0; ++int stack_ptr, step_count, repeat, len, len_save; + sljit_u32 chr; /* Any unicode character. */ + sljit_u8 *bytes, *bytes_end, byte; + PCRE2_SPTR alternative, cc_save, oc; +@@ -5687,11 +5714,44 @@ PCRE2_UCHAR othercase[1]; + #endif + + repeat = 1; ++stack_ptr = 0; ++step_count = 10000; + while (TRUE) + { +- if (*rec_count == 0) ++ if (--step_count == 0) + return 0; +- (*rec_count)--; ++ ++ SLJIT_ASSERT(chars <= chars_start + MAX_N_CHARS); ++ ++ if (chars >= chars_end) ++ { ++ if (stack_ptr == 0) ++ return (int)(chars_end - chars_start); ++ ++ --stack_ptr; ++ cc = cc_stack[stack_ptr]; ++ chars = chars_stack[stack_ptr]; ++ ++ if (chars >= chars_end) ++ continue; ++ ++ if (next_alternative_stack[stack_ptr] != 0) ++ { ++ /* When an alternative is processed, the ++ next alternative is pushed onto the stack. 
*/ ++ SLJIT_ASSERT(*cc == OP_ALT); ++ alternative = cc + GET(cc, 1); ++ if (*alternative == OP_ALT) ++ { ++ SLJIT_ASSERT(stack_ptr < SCAN_PREFIX_STACK_END); ++ SLJIT_ASSERT(chars_stack[stack_ptr] == chars); ++ SLJIT_ASSERT(next_alternative_stack[stack_ptr] == 1); ++ cc_stack[stack_ptr] = alternative; ++ stack_ptr++; ++ } ++ cc += 1 + LINK_SIZE; ++ } ++ } + + last = TRUE; + any = FALSE; +@@ -5768,9 +5828,17 @@ while (TRUE) + #ifdef SUPPORT_UNICODE + if (common->utf && HAS_EXTRALEN(*cc)) len += GET_EXTRALEN(*cc); + #endif +- max_chars = scan_prefix(common, cc + len, chars, max_chars, rec_count); +- if (max_chars == 0) +- return consumed; ++ if (stack_ptr >= SCAN_PREFIX_STACK_END) ++ { ++ chars_end = chars; ++ continue; ++ } ++ ++ cc_stack[stack_ptr] = cc + len; ++ chars_stack[stack_ptr] = chars; ++ next_alternative_stack[stack_ptr] = 0; ++ stack_ptr++; ++ + last = FALSE; + break; + +@@ -5788,12 +5856,18 @@ while (TRUE) + case OP_CBRA: + case OP_CBRAPOS: + alternative = cc + GET(cc, 1); +- while (*alternative == OP_ALT) ++ if (*alternative == OP_ALT) + { +- max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, max_chars, rec_count); +- if (max_chars == 0) +- return consumed; +- alternative += GET(alternative, 1); ++ if (stack_ptr >= SCAN_PREFIX_STACK_END) ++ { ++ chars_end = chars; ++ continue; ++ } ++ ++ cc_stack[stack_ptr] = alternative; ++ chars_stack[stack_ptr] = chars; ++ next_alternative_stack[stack_ptr] = 1; ++ stack_ptr++; + } + + if (*cc == OP_CBRA || *cc == OP_CBRAPOS) +@@ -5804,14 +5878,21 @@ while (TRUE) + case OP_CLASS: + #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 + if (common->utf && !is_char7_bitset((const sljit_u8 *)(cc + 1), FALSE)) +- return consumed; ++ { ++ chars_end = chars; ++ continue; ++ } + #endif + class = TRUE; + break; + + case OP_NCLASS: + #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 +- if (common->utf) return consumed; ++ if (common->utf) ++ { ++ chars_end = chars; ++ continue; ++ } + #endif + 
class = TRUE; + break; +@@ -5819,7 +5900,11 @@ while (TRUE) + #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 + case OP_XCLASS: + #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 +- if (common->utf) return consumed; ++ if (common->utf) ++ { ++ chars_end = chars; ++ continue; ++ } + #endif + any = TRUE; + cc += GET(cc, 1); +@@ -5829,7 +5914,10 @@ while (TRUE) + case OP_DIGIT: + #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 + if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_digit, FALSE)) +- return consumed; ++ { ++ chars_end = chars; ++ continue; ++ } + #endif + any = TRUE; + cc++; +@@ -5838,7 +5926,10 @@ while (TRUE) + case OP_WHITESPACE: + #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 + if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_space, FALSE)) +- return consumed; ++ { ++ chars_end = chars; ++ continue; ++ } + #endif + any = TRUE; + cc++; +@@ -5847,7 +5938,10 @@ while (TRUE) + case OP_WORDCHAR: + #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 + if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_word, FALSE)) +- return consumed; ++ { ++ chars_end = chars; ++ continue; ++ } + #endif + any = TRUE; + cc++; +@@ -5863,7 +5957,11 @@ while (TRUE) + case OP_ANY: + case OP_ALLANY: + #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 +- if (common->utf) return consumed; ++ if (common->utf) ++ { ++ chars_end = chars; ++ continue; ++ } + #endif + any = TRUE; + cc++; +@@ -5873,7 +5971,11 @@ while (TRUE) + case OP_NOTPROP: + case OP_PROP: + #if PCRE2_CODE_UNIT_WIDTH != 32 +- if (common->utf) return consumed; ++ if (common->utf) ++ { ++ chars_end = chars; ++ continue; ++ } + #endif + any = TRUE; + cc += 1 + 2; +@@ -5888,7 +5990,11 @@ while (TRUE) + case OP_NOTEXACT: + case OP_NOTEXACTI: + #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 +- if (common->utf) return consumed; ++ if 
(common->utf) ++ { ++ chars_end = chars; ++ continue; ++ } + #endif + any = TRUE; + repeat = GET2(cc, 1); +@@ -5896,21 +6002,20 @@ while (TRUE) + break; + + default: +- return consumed; ++ chars_end = chars; ++ continue; + } + ++ SLJIT_ASSERT(chars < chars_end); ++ + if (any) + { + do + { + chars->count = 255; +- +- consumed++; +- if (--max_chars == 0) +- return consumed; + chars++; + } +- while (--repeat > 0); ++ while (--repeat > 0 && chars < chars_end); + + repeat = 1; + continue; +@@ -5921,17 +6026,27 @@ while (TRUE) + bytes = (sljit_u8*) (cc + 1); + cc += 1 + 32 / sizeof(PCRE2_UCHAR); + ++ SLJIT_ASSERT(last == TRUE && repeat == 1); + switch (*cc) + { +- case OP_CRSTAR: +- case OP_CRMINSTAR: +- case OP_CRPOSSTAR: + case OP_CRQUERY: + case OP_CRMINQUERY: + case OP_CRPOSQUERY: +- max_chars = scan_prefix(common, cc + 1, chars, max_chars, rec_count); +- if (max_chars == 0) +- return consumed; ++ last = FALSE; ++ /* Fall through */ ++ case OP_CRSTAR: ++ case OP_CRMINSTAR: ++ case OP_CRPOSSTAR: ++ if (stack_ptr >= SCAN_PREFIX_STACK_END) ++ { ++ chars_end = chars; ++ continue; ++ } ++ ++ cc_stack[stack_ptr] = ++cc; ++ chars_stack[stack_ptr] = chars; ++ next_alternative_stack[stack_ptr] = 0; ++ stack_ptr++; + break; + + default: +@@ -5945,7 +6060,13 @@ while (TRUE) + case OP_CRPOSRANGE: + repeat = GET2(cc, 1); + if (repeat <= 0) +- return consumed; ++ { ++ chars_end = chars; ++ continue; ++ } ++ ++ last = (repeat != (int)GET2(cc, 1 + IMM2_SIZE)); ++ cc += 1 + 2 * IMM2_SIZE; + break; + } + +@@ -5980,36 +6101,13 @@ while (TRUE) + bytes = bytes_end - 32; + } + +- consumed++; +- if (--max_chars == 0) +- return consumed; + chars++; + } +- while (--repeat > 0); +- +- switch (*cc) +- { +- case OP_CRSTAR: +- case OP_CRMINSTAR: +- case OP_CRPOSSTAR: +- return consumed; +- +- case OP_CRQUERY: +- case OP_CRMINQUERY: +- case OP_CRPOSQUERY: +- cc++; +- break; +- +- case OP_CRRANGE: +- case OP_CRMINRANGE: +- case OP_CRPOSRANGE: +- if (GET2(cc, 1) != GET2(cc, 1 + IMM2_SIZE)) +- 
return consumed; +- cc += 1 + 2 * IMM2_SIZE; +- break; +- } ++ while (--repeat > 0 && chars < chars_end); + + repeat = 1; ++ if (last) ++ chars_end = chars; + continue; + } + +@@ -6025,7 +6123,10 @@ while (TRUE) + { + GETCHAR(chr, cc); + if ((int)PRIV(ord2utf)(char_othercase(common, chr), othercase) != len) +- return consumed; ++ { ++ chars_end = chars; ++ continue; ++ } + } + else + #endif +@@ -6056,7 +6157,6 @@ while (TRUE) + do + { + len--; +- consumed++; + + chr = *cc; + add_prefix_char(*cc, chars, len == 0); +@@ -6064,15 +6164,13 @@ while (TRUE) + if (caseless) + add_prefix_char(*oc, chars, len == 0); + +- if (--max_chars == 0) +- return consumed; + chars++; + cc++; + oc++; + } +- while (len > 0); ++ while (len > 0 && chars < chars_end); + +- if (--repeat == 0) ++ if (--repeat == 0 || chars >= chars_end) + break; + + len = len_save; +@@ -6081,7 +6179,7 @@ while (TRUE) + + repeat = 1; + if (last) +- return consumed; ++ chars_end = chars; + } + } + +@@ -6251,7 +6349,6 @@ int i, max, from; + int range_right = -1, range_len; + sljit_u8 *update_table = NULL; + BOOL in_range; +-sljit_u32 rec_count; + + for (i = 0; i < MAX_N_CHARS; i++) + { +@@ -6259,8 +6356,7 @@ for (i = 0; i < MAX_N_CHARS; i++) + chars[i].last_count = 0; + } + +-rec_count = 10000; +-max = scan_prefix(common, common->start, chars, MAX_N_CHARS, &rec_count); ++max = scan_prefix(common, common->start, chars); + + if (max < 1) + return FALSE; +diff --git a/src/pcre2_jit_test.c b/src/pcre2_jit_test.c +index 28bc7af9..066095fe 100644 +--- a/src/pcre2_jit_test.c ++++ b/src/pcre2_jit_test.c +@@ -286,6 +286,7 @@ static struct regression_test_case regression_test_cases[] = { + { CMU, A, 0, 0, "(a|b)?\?d((?:e)?)", "ABABdx" }, + { MU, A, 0, 0, "(a|b)?\?d((?:e)?)", "abcde" }, + { MU, A, 0, 0, "((?:ab)?\?g|b(?:g(nn|d)?\?)?)?\?(?:n)?m", "abgnbgnnbgdnmm" }, ++ { M, A, 0, 0, "(?:a?|a)b", "ba" }, + + /* Greedy and non-greedy + operators */ + { MU, A, 0, 0, "(aa)+aa", "aaaaaaa" }, +-- +2.33.0 + diff --git 
a/backport-avoid-inconsistency-between-d-and-digit-when-using-a.patch b/backport-avoid-inconsistency-between-d-and-digit-when-using-a.patch new file mode 100644 index 0000000000000000000000000000000000000000..c0c196c948c38cbc353b04b440c0d3291ca386cc --- /dev/null +++ b/backport-avoid-inconsistency-between-d-and-digit-when-using-a.patch @@ -0,0 +1,270 @@ +From 64549346f044dec18d18d06c2d08a68a68e26817 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= +Date: Sun, 9 Apr 2023 04:29:46 -0700 +Subject: [PATCH] avoid inconsistency between \d and [:digit:] when using /a + (#223) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Conflict:don't modify Changelog and doc/*; keep pcre2.h.generic consistent +with pcre2.h.in according to 1de7291 +Reference:https://github.com/PCRE2Project/pcre2/commit/64549346f044dec18d18d06c2d08a68a68e26817 + +Since a608946 (Additional PCRE2_EXTRA_ASCII_xxx code, 2023-02-01) +PCRE2_EXTRA_ASCII_BSD could be used to restrict \d to ASCII causing +the following inconsistent behaviour in UCP mode. + + PCRE2 version 10.43-DEV 2023-01-15 + re> /\d/utf,ucp,ascii_bsd + data> ٣ + No match + data> + re> /[[:digit:]]/utf,ucp,ascii_bsd + data> ٣ + 0: \x{663} + +It has been suggested[1] that the change to match \p{Nd} when Unicode +is enabled for [:digit:] might have been unintentional and a bug, as +[:digit:] should be able to be POSIX compatible, so add a new flag +PCRE2_EXTRA_ASCII_DIGIT to avoid changing its definition in UCP mode. 
+ +[1] https://lore.kernel.org/git/CANgJU+U+xXsh9psd0z5Xjr+Se5QgdKkjQ7LUQ-PdUULSN3n4+g@mail.gmail.com/ +--- + src/pcre2.h.generic | 6 ++++++ + src/pcre2.h.in | 1 + + src/pcre2_compile.c | 6 ++++-- + src/pcre2test.c | 4 +++- + testdata/testinput5 | 10 +++++++++- + testdata/testinput7 | 10 ++++++++-- + testdata/testoutput5 | 19 ++++++++++++++++++- + testdata/testoutput7 | 13 +++++++++++-- + 8 files changed, 60 insertions(+), 9 deletions(-) + +diff --git a/src/pcre2.h.generic b/src/pcre2.h.generic +index dad774ce..05cf9bc1 100644 +--- a/src/pcre2.h.generic ++++ b/src/pcre2.h.generic +@@ -153,6 +153,12 @@ D is inspected during pcre2_dfa_match() execution + #define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */ + #define PCRE2_EXTRA_ALT_BSUX 0x00000020u /* C */ + #define PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK 0x00000040u /* C */ ++#define PCRE2_EXTRA_CASELESS_RESTRICT 0x00000080u /* C */ ++#define PCRE2_EXTRA_ASCII_BSD 0x00000100u /* C */ ++#define PCRE2_EXTRA_ASCII_BSS 0x00000200u /* C */ ++#define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */ ++#define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */ ++#define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */ + + /* These are for pcre2_jit_compile(). */ + +diff --git a/src/pcre2.h.in b/src/pcre2.h.in +index 7202c633..cd7fdcf2 100644 +--- a/src/pcre2.h.in ++++ b/src/pcre2.h.in +@@ -158,6 +158,7 @@ D is inspected during pcre2_dfa_match() execution + #define PCRE2_EXTRA_ASCII_BSS 0x00000200u /* C */ + #define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */ + #define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */ ++#define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */ + + /* These are for pcre2_jit_compile(). */ + +diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c +index 95c4a79d..634360b7 100644 +--- a/src/pcre2_compile.c ++++ b/src/pcre2_compile.c +@@ -786,7 +786,8 @@ are allowed. 
*/ + PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \ + PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \ + PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \ +- PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX) ++ PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \ ++ PCRE2_EXTRA_ASCII_DIGIT) + + /* Compile time error code numbers. They are given names so that they can more + easily be tracked. When a new number is added, the tables called eint1 and +@@ -3581,7 +3582,8 @@ while (ptr < ptrend) + + #ifdef SUPPORT_UNICODE + if ((options & PCRE2_UCP) != 0 && +- (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0) ++ (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 && ++ !(posix_class == 7 && (xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0)) + { + int ptype = posix_substitutes[2*posix_class]; + int pvalue = posix_substitutes[2*posix_class + 1]; +diff --git a/src/pcre2test.c b/src/pcre2test.c +index 4da3ef90..21b19370 100644 +--- a/src/pcre2test.c ++++ b/src/pcre2test.c +@@ -651,6 +651,7 @@ static modstruct modlist[] = { + { "ascii_bsd", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSD, CO(extra_options) }, + { "ascii_bss", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSS, CO(extra_options) }, + { "ascii_bsw", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSW, CO(extra_options) }, ++ { "ascii_digit", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_DIGIT, CO(extra_options) }, + { "ascii_posix", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_POSIX, CO(extra_options) }, + { "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) }, + { "bad_escape_is_literal", MOD_CTC, MOD_OPT, PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL, CO(extra_options) }, +@@ -4294,13 +4295,14 @@ show_compile_extra_options(uint32_t options, const char *before, + const char *after) + { + if (options == 0) fprintf(outfile, "%s %s", before, after); +-else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s", ++else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + before, + ((options & 
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)? " allow_surrogate_escapes" : "", + ((options & PCRE2_EXTRA_ALT_BSUX) != 0)? " alt_bsux" : "", + ((options & PCRE2_EXTRA_ASCII_BSD) != 0)? " ascii_bsd" : "", + ((options & PCRE2_EXTRA_ASCII_BSS) != 0)? " ascii_bss" : "", + ((options & PCRE2_EXTRA_ASCII_BSW) != 0)? " ascii_bsw" : "", ++ ((options & PCRE2_EXTRA_ASCII_DIGIT) != 0)? " ascii_digit" : "", + ((options & PCRE2_EXTRA_ASCII_POSIX) != 0)? " ascii_posix" : "", + ((options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) != 0)? " bad_escape_is_literal" : "", + ((options & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)? " caseless_restrict" : "", +diff --git a/testdata/testinput5 b/testdata/testinput5 +index 0f105408..0624a0c3 100644 +--- a/testdata/testinput5 ++++ b/testdata/testinput5 +@@ -1215,6 +1215,8 @@ + + /[[:digit:]]/B,ucp + ++/[[:digit:]]/B,ucp,ascii_digit ++ + /[[:graph:]]/B,ucp + + /[[:print:]]/B,ucp +@@ -1227,7 +1229,7 @@ + + /[[:xdigit:]]/B,ucp + +-# Unicode properties for \b abd \B ++# Unicode properties for \b and \B + + /\b...\B/utf,ucp + abc_ +@@ -2431,6 +2433,12 @@ + /[[:digit:]]+/utf,ucp + 123\x{660}456 + ++/[[:digit:]]+/utf,ucp,ascii_digit ++ 123\x{660}456 ++ ++/[[:digit:]]+/g,utf,ucp,ascii_digit ++ 123\x{660}456 ++ + /[[:digit:]]+/utf,ucp,ascii_posix + 123\x{660}456 + +diff --git a/testdata/testinput7 b/testdata/testinput7 +index a2b7fb8d..96deaa30 100644 +--- a/testdata/testinput7 ++++ b/testdata/testinput7 +@@ -1657,7 +1657,7 @@ + /^[\p{Xwd}]+/utf + ABCD1234\x{6ca}\x{a6c}\x{10a7}_ + +-# Unicode properties for \b abd \B ++# Unicode properties for \b and \B + + /\b...\B/utf,ucp + abc_ +@@ -2435,9 +2435,15 @@ + /[[:digit:]]+/utf,ucp + 123\x{660}456 + ++/[[:digit:]]+/utf,ucp,ascii_digit ++ 123\x{660}456 ++ ++/[[:digit:]]+/g,utf,ucp,ascii_digit ++ 123\x{660}456 ++ + /[[:digit:]]+/utf,ucp,ascii_posix + 123\x{660}456 +- ++ + />[[:space:]]+\x{a0} \x{a0}< + >\x{a0}\x{a0}\x{a0}< +diff --git a/testdata/testoutput5 b/testdata/testoutput5 +index 3cee990e..febcc954 100644 +--- 
a/testdata/testoutput5 ++++ b/testdata/testoutput5 +@@ -2520,6 +2520,14 @@ No match + End + ------------------------------------------------------------------ + ++/[[:digit:]]/B,ucp,ascii_digit ++------------------------------------------------------------------ ++ Bra ++ [0-9] ++ Ket ++ End ++------------------------------------------------------------------ ++ + /[[:graph:]]/B,ucp + ------------------------------------------------------------------ + Bra +@@ -2568,7 +2576,7 @@ No match + End + ------------------------------------------------------------------ + +-# Unicode properties for \b abd \B ++# Unicode properties for \b and \B + + /\b...\B/utf,ucp + abc_ +@@ -5359,6 +5367,15 @@ No match + 123\x{660}456 + 0: 123\x{660}456 + ++/[[:digit:]]+/utf,ucp,ascii_digit ++ 123\x{660}456 ++ 0: 123 ++ ++/[[:digit:]]+/g,utf,ucp,ascii_digit ++ 123\x{660}456 ++ 0: 123 ++ 0: 456 ++ + /[[:digit:]]+/utf,ucp,ascii_posix + 123\x{660}456 + 0: 123 +diff --git a/testdata/testoutput7 b/testdata/testoutput7 +index 4065981d..d98178e6 100644 +--- a/testdata/testoutput7 ++++ b/testdata/testoutput7 +@@ -2853,7 +2853,7 @@ No match + ABCD1234\x{6ca}\x{a6c}\x{10a7}_ + 0: ABCD1234\x{6ca}\x{a6c}\x{10a7}_ + +-# Unicode properties for \b abd \B ++# Unicode properties for \b and \B + + /\b...\B/utf,ucp + abc_ +@@ -4080,10 +4080,19 @@ No match + 123\x{660}456 + 0: 123\x{660}456 + ++/[[:digit:]]+/utf,ucp,ascii_digit ++ 123\x{660}456 ++ 0: 123 ++ ++/[[:digit:]]+/g,utf,ucp,ascii_digit ++ 123\x{660}456 ++ 0: 123 ++ 0: 456 ++ + /[[:digit:]]+/utf,ucp,ascii_posix + 123\x{660}456 + 0: 123 +- ++ + />[[:space:]]+\x{a0} \x{a0}< + 0: >\x{a0} \x{a0}< +-- +2.33.0 + diff --git a/pcre2.spec b/pcre2.spec index fec618d9d90b07bca0b1c816a3e0f83e067c1ec3..f9103243422388872937bc8c3d7dbf239f178b5f 100644 --- a/pcre2.spec +++ b/pcre2.spec @@ -1,6 +1,6 @@ Name: pcre2 Version: 10.42 -Release: 11 +Release: 12 Summary: Perl Compatible Regular Expressions License: BSD URL: http://www.pcre.org/ @@ -39,6 +39,11 @@ Patch6027: 
backport-Add-Perl-titlecasing-475.patch Patch6028: backport-Fix-incorrect-positive-error-code-from-pcre2_substitute.patch Patch6029: backport-pcre2_compile-avoid-1-byte-buffer-overread-parsing-V.patch Patch6030: backport-Improve-error-message-for-N-name-in-character-classes.patch +Patch6031: backport-Further-ASCII-tests-and-minor-bugfix-plus-ChangeLog-.patch +Patch6032: backport-avoid-inconsistency-between-d-and-digit-when-using-a.patch +Patch6033: backport-Fix-the-lookahead-after-d-or-posix-to-skip-whitespac.patch +Patch6034: backport-Improve-error-offsets-for-character-classes-548.patch +Patch6035: backport-Non-recursive-scan-prefix-in-JIT-560.patch BuildRequires: autoconf libtool automake coreutils gcc make readline-devel Obsoletes: pcre2-utf16 pcre2-utf32 pcre2-tools @@ -156,6 +161,14 @@ make check %{_pkgdocdir}/html/ %changelog +* Tue Dec 10 2024 hugel - 10.42-12 +- DESC:sync patches from upstream + backport-Further-ASCII-tests-and-minor-bugfix-plus-ChangeLog-.patch + backport-avoid-inconsistency-between-d-and-digit-when-using-a.patch + backport-Fix-the-lookahead-after-d-or-posix-to-skip-whitespac.patch + backport-Improve-error-offsets-for-character-classes-548.patch + backport-Non-recursive-scan-prefix-in-JIT-560.patch + * Tue Nov 19 2024 yanglongkang - 10.42-11 - DESC:sync patches from upstream