代码拉取完成,页面将自动刷新
From 6f2da25f009ff463cd9357ae5ebe452fbec8ab5c Mon Sep 17 00:00:00 2001
From: Zoltan Herczeg <zherczeg7@gmail.com>
Date: Fri, 15 Nov 2024 13:21:03 +0100
Subject: [PATCH] Non-recursive scan prefix in JIT (#560)
Conflict:NA
Reference:https://github.com/PCRE2Project/pcre2/commit/6f2da25f009ff463cd9357ae5ebe452fbec8ab5c
---
src/pcre2_jit_compile.c | 238 ++++++++++++++++++++++++++++------------
src/pcre2_jit_test.c | 1 +
2 files changed, 168 insertions(+), 71 deletions(-)
diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c
index 127c393d..4449d59f 100644
--- a/src/pcre2_jit_compile.c
+++ b/src/pcre2_jit_compile.c
@@ -5670,11 +5670,38 @@ if (last)
chars->last_count++;
}
-static int scan_prefix(compiler_common *common, PCRE2_SPTR cc, fast_forward_char_data *chars, int max_chars, sljit_u32 *rec_count)
+/* Value can be increased if needed. Patterns
+such as /(a|){33}b/ can exhaust the stack.
+
+Note: /(a|){29}b/ already stops scan_prefix()
+because it reaches the maximum step_count. */
+#define SCAN_PREFIX_STACK_END 32
+
+/*
+Scan prefix stores the prefix string in the chars array.
+The elements of the chars array is either small character
+sets or "any" (count is set to 255).
+
+Examples (the chars array is represented by a simple regex):
+
+/(abc|xbyd)/ prefix: /[ax]b[cy]/ (length: 3)
+/a[a-z]b+c/ prefix: a.b (length: 3)
+/ab?cd/ prefix: a[bc][cd] (length: 3)
+/(ab|cd)|(ef|gh)/ prefix: [aceg][bdfh] (length: 2)
+
+The length is returned by scan_prefix(). The length is
+less than or equal than the minimum length of the pattern.
+*/
+
+static int scan_prefix(compiler_common *common, PCRE2_SPTR cc, fast_forward_char_data *chars)
{
-/* Recursive function, which scans prefix literals. */
+fast_forward_char_data *chars_start = chars;
+fast_forward_char_data *chars_end = chars + MAX_N_CHARS;
+PCRE2_SPTR cc_stack[SCAN_PREFIX_STACK_END];
+fast_forward_char_data *chars_stack[SCAN_PREFIX_STACK_END];
+sljit_u8 next_alternative_stack[SCAN_PREFIX_STACK_END];
BOOL last, any, class, caseless;
-int len, repeat, len_save, consumed = 0;
+int stack_ptr, step_count, repeat, len, len_save;
sljit_u32 chr; /* Any unicode character. */
sljit_u8 *bytes, *bytes_end, byte;
PCRE2_SPTR alternative, cc_save, oc;
@@ -5687,11 +5714,44 @@ PCRE2_UCHAR othercase[1];
#endif
repeat = 1;
+stack_ptr = 0;
+step_count = 10000;
while (TRUE)
{
- if (*rec_count == 0)
+ if (--step_count == 0)
return 0;
- (*rec_count)--;
+
+ SLJIT_ASSERT(chars <= chars_start + MAX_N_CHARS);
+
+ if (chars >= chars_end)
+ {
+ if (stack_ptr == 0)
+ return (int)(chars_end - chars_start);
+
+ --stack_ptr;
+ cc = cc_stack[stack_ptr];
+ chars = chars_stack[stack_ptr];
+
+ if (chars >= chars_end)
+ continue;
+
+ if (next_alternative_stack[stack_ptr] != 0)
+ {
+ /* When an alternative is processed, the
+ next alternative is pushed onto the stack. */
+ SLJIT_ASSERT(*cc == OP_ALT);
+ alternative = cc + GET(cc, 1);
+ if (*alternative == OP_ALT)
+ {
+ SLJIT_ASSERT(stack_ptr < SCAN_PREFIX_STACK_END);
+ SLJIT_ASSERT(chars_stack[stack_ptr] == chars);
+ SLJIT_ASSERT(next_alternative_stack[stack_ptr] == 1);
+ cc_stack[stack_ptr] = alternative;
+ stack_ptr++;
+ }
+ cc += 1 + LINK_SIZE;
+ }
+ }
last = TRUE;
any = FALSE;
@@ -5768,9 +5828,17 @@ while (TRUE)
#ifdef SUPPORT_UNICODE
if (common->utf && HAS_EXTRALEN(*cc)) len += GET_EXTRALEN(*cc);
#endif
- max_chars = scan_prefix(common, cc + len, chars, max_chars, rec_count);
- if (max_chars == 0)
- return consumed;
+ if (stack_ptr >= SCAN_PREFIX_STACK_END)
+ {
+ chars_end = chars;
+ continue;
+ }
+
+ cc_stack[stack_ptr] = cc + len;
+ chars_stack[stack_ptr] = chars;
+ next_alternative_stack[stack_ptr] = 0;
+ stack_ptr++;
+
last = FALSE;
break;
@@ -5788,12 +5856,18 @@ while (TRUE)
case OP_CBRA:
case OP_CBRAPOS:
alternative = cc + GET(cc, 1);
- while (*alternative == OP_ALT)
+ if (*alternative == OP_ALT)
{
- max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, max_chars, rec_count);
- if (max_chars == 0)
- return consumed;
- alternative += GET(alternative, 1);
+ if (stack_ptr >= SCAN_PREFIX_STACK_END)
+ {
+ chars_end = chars;
+ continue;
+ }
+
+ cc_stack[stack_ptr] = alternative;
+ chars_stack[stack_ptr] = chars;
+ next_alternative_stack[stack_ptr] = 1;
+ stack_ptr++;
}
if (*cc == OP_CBRA || *cc == OP_CBRAPOS)
@@ -5804,14 +5878,21 @@ while (TRUE)
case OP_CLASS:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf && !is_char7_bitset((const sljit_u8 *)(cc + 1), FALSE))
- return consumed;
+ {
+ chars_end = chars;
+ continue;
+ }
#endif
class = TRUE;
break;
case OP_NCLASS:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
- if (common->utf) return consumed;
+ if (common->utf)
+ {
+ chars_end = chars;
+ continue;
+ }
#endif
class = TRUE;
break;
@@ -5819,7 +5900,11 @@ while (TRUE)
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
case OP_XCLASS:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
- if (common->utf) return consumed;
+ if (common->utf)
+ {
+ chars_end = chars;
+ continue;
+ }
#endif
any = TRUE;
cc += GET(cc, 1);
@@ -5829,7 +5914,10 @@ while (TRUE)
case OP_DIGIT:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_digit, FALSE))
- return consumed;
+ {
+ chars_end = chars;
+ continue;
+ }
#endif
any = TRUE;
cc++;
@@ -5838,7 +5926,10 @@ while (TRUE)
case OP_WHITESPACE:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_space, FALSE))
- return consumed;
+ {
+ chars_end = chars;
+ continue;
+ }
#endif
any = TRUE;
cc++;
@@ -5847,7 +5938,10 @@ while (TRUE)
case OP_WORDCHAR:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_word, FALSE))
- return consumed;
+ {
+ chars_end = chars;
+ continue;
+ }
#endif
any = TRUE;
cc++;
@@ -5863,7 +5957,11 @@ while (TRUE)
case OP_ANY:
case OP_ALLANY:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
- if (common->utf) return consumed;
+ if (common->utf)
+ {
+ chars_end = chars;
+ continue;
+ }
#endif
any = TRUE;
cc++;
@@ -5873,7 +5971,11 @@ while (TRUE)
case OP_NOTPROP:
case OP_PROP:
#if PCRE2_CODE_UNIT_WIDTH != 32
- if (common->utf) return consumed;
+ if (common->utf)
+ {
+ chars_end = chars;
+ continue;
+ }
#endif
any = TRUE;
cc += 1 + 2;
@@ -5888,7 +5990,11 @@ while (TRUE)
case OP_NOTEXACT:
case OP_NOTEXACTI:
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
- if (common->utf) return consumed;
+ if (common->utf)
+ {
+ chars_end = chars;
+ continue;
+ }
#endif
any = TRUE;
repeat = GET2(cc, 1);
@@ -5896,21 +6002,20 @@ while (TRUE)
break;
default:
- return consumed;
+ chars_end = chars;
+ continue;
}
+ SLJIT_ASSERT(chars < chars_end);
+
if (any)
{
do
{
chars->count = 255;
-
- consumed++;
- if (--max_chars == 0)
- return consumed;
chars++;
}
- while (--repeat > 0);
+ while (--repeat > 0 && chars < chars_end);
repeat = 1;
continue;
@@ -5921,17 +6026,27 @@ while (TRUE)
bytes = (sljit_u8*) (cc + 1);
cc += 1 + 32 / sizeof(PCRE2_UCHAR);
+ SLJIT_ASSERT(last == TRUE && repeat == 1);
switch (*cc)
{
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPOSSTAR:
case OP_CRQUERY:
case OP_CRMINQUERY:
case OP_CRPOSQUERY:
- max_chars = scan_prefix(common, cc + 1, chars, max_chars, rec_count);
- if (max_chars == 0)
- return consumed;
+ last = FALSE;
+ /* Fall through */
+ case OP_CRSTAR:
+ case OP_CRMINSTAR:
+ case OP_CRPOSSTAR:
+ if (stack_ptr >= SCAN_PREFIX_STACK_END)
+ {
+ chars_end = chars;
+ continue;
+ }
+
+ cc_stack[stack_ptr] = ++cc;
+ chars_stack[stack_ptr] = chars;
+ next_alternative_stack[stack_ptr] = 0;
+ stack_ptr++;
break;
default:
@@ -5945,7 +6060,13 @@ while (TRUE)
case OP_CRPOSRANGE:
repeat = GET2(cc, 1);
if (repeat <= 0)
- return consumed;
+ {
+ chars_end = chars;
+ continue;
+ }
+
+ last = (repeat != (int)GET2(cc, 1 + IMM2_SIZE));
+ cc += 1 + 2 * IMM2_SIZE;
break;
}
@@ -5980,36 +6101,13 @@ while (TRUE)
bytes = bytes_end - 32;
}
- consumed++;
- if (--max_chars == 0)
- return consumed;
chars++;
}
- while (--repeat > 0);
-
- switch (*cc)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPOSSTAR:
- return consumed;
-
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- case OP_CRPOSQUERY:
- cc++;
- break;
-
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- case OP_CRPOSRANGE:
- if (GET2(cc, 1) != GET2(cc, 1 + IMM2_SIZE))
- return consumed;
- cc += 1 + 2 * IMM2_SIZE;
- break;
- }
+ while (--repeat > 0 && chars < chars_end);
repeat = 1;
+ if (last)
+ chars_end = chars;
continue;
}
@@ -6025,7 +6123,10 @@ while (TRUE)
{
GETCHAR(chr, cc);
if ((int)PRIV(ord2utf)(char_othercase(common, chr), othercase) != len)
- return consumed;
+ {
+ chars_end = chars;
+ continue;
+ }
}
else
#endif
@@ -6056,7 +6157,6 @@ while (TRUE)
do
{
len--;
- consumed++;
chr = *cc;
add_prefix_char(*cc, chars, len == 0);
@@ -6064,15 +6164,13 @@ while (TRUE)
if (caseless)
add_prefix_char(*oc, chars, len == 0);
- if (--max_chars == 0)
- return consumed;
chars++;
cc++;
oc++;
}
- while (len > 0);
+ while (len > 0 && chars < chars_end);
- if (--repeat == 0)
+ if (--repeat == 0 || chars >= chars_end)
break;
len = len_save;
@@ -6081,7 +6179,7 @@ while (TRUE)
repeat = 1;
if (last)
- return consumed;
+ chars_end = chars;
}
}
@@ -6251,7 +6349,6 @@ int i, max, from;
int range_right = -1, range_len;
sljit_u8 *update_table = NULL;
BOOL in_range;
-sljit_u32 rec_count;
for (i = 0; i < MAX_N_CHARS; i++)
{
@@ -6259,8 +6356,7 @@ for (i = 0; i < MAX_N_CHARS; i++)
chars[i].last_count = 0;
}
-rec_count = 10000;
-max = scan_prefix(common, common->start, chars, MAX_N_CHARS, &rec_count);
+max = scan_prefix(common, common->start, chars);
if (max < 1)
return FALSE;
diff --git a/src/pcre2_jit_test.c b/src/pcre2_jit_test.c
index 28bc7af9..066095fe 100644
--- a/src/pcre2_jit_test.c
+++ b/src/pcre2_jit_test.c
@@ -286,6 +286,7 @@ static struct regression_test_case regression_test_cases[] = {
{ CMU, A, 0, 0, "(a|b)?\?d((?:e)?)", "ABABdx" },
{ MU, A, 0, 0, "(a|b)?\?d((?:e)?)", "abcde" },
{ MU, A, 0, 0, "((?:ab)?\?g|b(?:g(nn|d)?\?)?)?\?(?:n)?m", "abgnbgnnbgdnmm" },
+ { M, A, 0, 0, "(?:a?|a)b", "ba" },
/* Greedy and non-greedy + operators */
{ MU, A, 0, 0, "(aa)+aa", "aaaaaaa" },
--
2.33.0
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。