From f738245ad94307f126e0f8985c66a7aad508ad3d Mon Sep 17 00:00:00 2001 From: animalize Date: Fri, 8 Mar 2019 20:48:58 +0800 Subject: [PATCH 01/21] 1. add asserts If no error occurs, when the top level call returns: - the stack should be empty - state->repeat should be NULL This prevents the following commits to wreck the stack. --- .../Library/2019-03-14-09-08-25.bpo-35859.8lFdLe.rst | 2 ++ Modules/sre_lib.h | 12 ++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2019-03-14-09-08-25.bpo-35859.8lFdLe.rst diff --git a/Misc/NEWS.d/next/Library/2019-03-14-09-08-25.bpo-35859.8lFdLe.rst b/Misc/NEWS.d/next/Library/2019-03-14-09-08-25.bpo-35859.8lFdLe.rst new file mode 100644 index 00000000000000..fa27434a62862c --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-03-14-09-08-25.bpo-35859.8lFdLe.rst @@ -0,0 +1,2 @@ +re module, fix a few bugs about capturing group. In rare cases, capturing +group gets an incorrect string. Patch by Ma Lin. \ No newline at end of file diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 437ab43f434a62..f5602f7f0ee433 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -1310,8 +1310,15 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) ctx_pos = ctx->last_ctx_pos; jump = ctx->jump; DATA_POP_DISCARD(ctx); - if (ctx_pos == -1) + if (ctx_pos == -1) { +#ifdef Py_DEBUG + if (ctx->toplevel && ret >= 0) { + assert(state->repeat == NULL); + assert(state->data_stack_base == 0); + } +#endif return ret; + } DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); switch (jump) { @@ -1360,7 +1367,8 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) break; } - return ret; /* should never get here */ + /* should never get here */ + Py_UNREACHABLE(); } /* need to reset capturing groups between two SRE(match) callings in loops */ From 0d1e658eadddd68a1c27a8f80e54af83666b2cbd Mon Sep 17 00:00:00 2001 From: animalize Date: Fri, 8 Mar 2019 20:58:41 +0800 Subject: [PATCH 02/21] 2. macro MARK_PUSH() bug MARK_PUSH() didn't protect MARK0 if it was the only available mark. This bug was reported in issue35859. --- Lib/test/test_re.py | 8 ++++++++ Modules/sre_lib.h | 8 ++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 0a77e6fe9edc03..ce357707ffd02d 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2100,6 +2100,14 @@ def test_bug_34294(self): {'tag': 'foo', 'text': None}, {'tag': 'foo', 'text': None}]) + def test_bug_35859(self): + # MARK_PUSH() macro didn't protect MARK-0 + # if it was the only available mark. + self.assertEqual(re.match(r'(ab|a)*?b', 'ab').groups(), ('a',)) + self.assertEqual(re.match(r'(ab|a)+?b', 'ab').groups(), ('a',)) + self.assertEqual(re.match(r'(ab|a){0,2}?b', 'ab').groups(), ('a',)) + self.assertEqual(re.match(r'(.b|a)*?b', 'ab').groups(), ('a',)) + class PatternReprTests(unittest.TestCase): def check(self, pattern, expected): diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index f5602f7f0ee433..b23bd31bde8d27 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -478,20 +478,20 @@ do { \ DATA_STACK_LOOKUP_AT(state,t,p,pos) #define MARK_PUSH(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ i = lastmark; /* ctx->lastmark may change if reallocated */ \ DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \ } while (0) #define MARK_POP(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \ } while (0) #define MARK_POP_KEEP(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \ } while (0) #define MARK_POP_DISCARD(lastmark) \ - do if (lastmark > 0) { \ + do if (lastmark >= 0) { \ DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \ } while (0) From cbc10fc9afb385e08d8f3ddc7ab9c8b8ed318eb8 Mon Sep 17 00:00:00 2001 From: animalize Date: Thu, 14 Mar 2019 08:11:23 +0800 Subject: [PATCH 03/21] 3. restore state->repeat at the first opportunity JUMP_MAX_UNTIL_3 and JUMP_MIN_UNTIL_2 are tails of a repeat. Before the JUMPs, sre sets state->repeat to the tail's repeat object: state->repeat = ctx->u.rep->prev; DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern); But after the JUMPs, it doesn't restore state->repeat if the JUMP is SUCCESS. This won't cause a bug, because SRE_OP_REPEAT will restore state->repeat later. But it messes up state->repeat in some levels of JUMPs. This behavior exists since commit 29c4ba9a (2000-Aug-2), a very immature period of sre. Now fix like this: state->repeat = ctx->u.rep->prev; DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern); + state->repeat = ctx->u.rep; RETURN_ON_SUCCESS(ret); - state->repeat = ctx->u.rep; Then backtrack points don't need to save state->repeat to ctx->u.rep. Otherwise, if backtrack point is inside a repeat body, after a JUMP, state->repeat may points to the tail's repeat object (NULL or different repeat object). --- Modules/sre_lib.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index b23bd31bde8d27..05b4a4ff26c0d1 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -797,8 +797,7 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) /* <0=skip> code ... */ TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr)); LASTMARK_SAVE(); - ctx->u.rep = state->repeat; - if (ctx->u.rep) + if (state->repeat) MARK_PUSH(ctx->lastmark); for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) { if (ctx->pattern[1] == SRE_OP_LITERAL && @@ -813,16 +812,16 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) state->ptr = ctx->ptr; DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1); if (ret) { - if (ctx->u.rep) + if (state->repeat) MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_SUCCESS; } - if (ctx->u.rep) + if (state->repeat) MARK_POP_KEEP(ctx->lastmark); LASTMARK_RESTORE(); } - if (ctx->u.rep) + if (state->repeat) MARK_POP_DISCARD(ctx->lastmark); RETURN_FAILURE; @@ -1071,8 +1070,9 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) tail matches */ state->repeat = ctx->u.rep->prev; DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern); + state->repeat = ctx->u.rep; /* restore repeat before return */ + RETURN_ON_SUCCESS(ret); - state->repeat = ctx->u.rep; state->ptr = ctx->ptr; RETURN_FAILURE; @@ -1110,12 +1110,12 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) /* see if the tail matches */ state->repeat = ctx->u.rep->prev; DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern); + state->repeat = ctx->u.rep; /* restore repeat before return */ + if (ret) { RETURN_ON_ERROR(ret); RETURN_SUCCESS; } - - state->repeat = ctx->u.rep; state->ptr = ctx->ptr; LASTMARK_RESTORE(); From 1043bc6cdfd623f8755c248df8293ad84665698d Mon Sep 17 00:00:00 2001 From: animalize Date: Thu, 14 Mar 2019 08:30:04 +0800 Subject: [PATCH 04/21] 4. JUMP_MIN_UNTIL_2 should MARK_PUSH() if in a repeat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These bugs are due to the lack of protection before JUMPs. This PR will add some protections shown in below table. Legend of the table: * No: no protection * L : LASTMARK_SAVE() * MU: MARK_PUSH() unconditionally * MR: MARK_PUSH() if in a repeat unpatched patched JUMP_MAX_UNTIL_1 No No # repeat body, to min limit JUMP_MAX_UNTIL_2 L/MU L/MU # repeat body, until max limit JUMP_MAX_UNTIL_3 No No # backtrack point JUMP_MIN_UNTIL_1 No No # repeat body, to min limit JUMP_MIN_UNTIL_2 L -> L/MR # backtrack point JUMP_MIN_UNTIL_3 No No # repeat body, until max limit JUMP_REPEAT_ONE_1 L -> L/MR # backtrack point JUMP_REPEAT_ONE_2 L -> L/MR # backtrack point JUMP_MIN_REPEAT_ONE L -> L/MR # backtrack point JUMP_BRANCH L/MR L/MR # backtrack point JUMP_ASSERT No No # sub-pattern JUMP_ASSERT_NOT No -> L/MR # sub-pattern 🔼 It's clear that every backtrack points should L/MR. Commit 4/5/6 will add L/MR to backtrack points respectively. JUMP_MAX_UNTIL_3 is a backtrack point as well, but this one doesn't need L/MR protection. See this picture's explanation: https://raw.githubusercontent.com/animalize/pics/master/issues/max_until.jpg 🔼 JUMP_ASSERT_NOT's situation will be explained in Commit 7. 🔼 It seems other JUMPs don't need protect. If these JUMPs fail, either an entire match failure or can be restored by backtrack points. - JUMP_MAX_UNTIL_1 - JUMP_MIN_UNTIL_1 - JUMP_MIN_UNTIL_3 - JUMP_ASSERT I really can't find a test-case to break them. If I missed something, hope Commit 11 will catch them. 🔼 Test test-cases in this change with major regex engines: Case-A: s = 'axxzbcz' p = r'(?:(?:a|bc)*?(xx)??z)*' re.match(p, s).groups() Case-B: s = 'xtcxyzxc' p = r'((x|yz)+?(t)??c)*' re.match(p, s).groups() Case-A Case-B PHP 7.3.2 "xx" "xyzxc","x","t" Java 11.0.2 "xx" "xyzxc","x","t" Perl 5.28.1 undef [1] "xyzxc","x",undef [1] Ruby 2.6.1 "xx" "xyzxc","x","t" Go 1.12 "xx" "xyzxc","x","t" Rust 1.32.0 "xx" "xyzxc","x","t" Node.js 10.15.1 undef [1] "xyzxc","x",undef [1] regex 2019.3.12 "xx" "xyzxc","x","t" re before patch "" [2] "xyzxc","x","" [2] re after patch "xx" "xyzxc","x","t" [1] different policy, not bug. [2] "" is empty str, not None. --- Lib/test/test_re.py | 9 +++++++++ Modules/sre_lib.h | 16 ++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index ce357707ffd02d..1946f2857ed2c6 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2108,6 +2108,15 @@ def test_bug_35859(self): self.assertEqual(re.match(r'(ab|a){0,2}?b', 'ab').groups(), ('a',)) self.assertEqual(re.match(r'(.b|a)*?b', 'ab').groups(), ('a',)) + # JUMP_MIN_UNTIL_2 should MARK_PUSH() if in a repeat + s = 'axxzbcz' + p = r'(?:(?:a|bc)*?(xx)??z)*' + self.assertEqual(re.match(p, s).groups(), ('xx',)) + # test-case provided by issue9134 + s = 'xtcxyzxc' + p = r'((x|yz)+?(t)??c)*' + self.assertEqual(re.match(p, s).groups(), ('xyzxc', 'x', 't')) + class PatternReprTests(unittest.TestCase): def check(self, pattern, expected): diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 05b4a4ff26c0d1..b6a3bdd0eaa004 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -1105,21 +1105,29 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) RETURN_FAILURE; } - LASTMARK_SAVE(); - /* see if the tail matches */ state->repeat = ctx->u.rep->prev; + + LASTMARK_SAVE(); + if (state->repeat) + MARK_PUSH(ctx->lastmark); + DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern); + SRE_REPEAT *repeat_of_tail = state->repeat; state->repeat = ctx->u.rep; /* restore repeat before return */ if (ret) { + if (repeat_of_tail) + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_SUCCESS; } - state->ptr = ctx->ptr; - + if (repeat_of_tail) + MARK_POP(ctx->lastmark); LASTMARK_RESTORE(); + state->ptr = ctx->ptr; + if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2] && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) || state->ptr == ctx->u.rep->last_ptr) From 8e1407ee98f685afe69bd8478707c5f00c78c303 Mon Sep 17 00:00:00 2001 From: animalize Date: Thu, 14 Mar 2019 08:35:37 +0800 Subject: [PATCH 05/21] 5. JUMP_REPEAT_ONE_[12] should MARK_PUSH() if in a repeat Case-A: s = 'aabaab' p = r'(?:[^b]*a(?=(b)|(a))ab)*' re.match(p, s).groups() Case-B: s = 'abab' p = r'(?:[^b]*(?=(b)|(a))ab)*' re.match(p, s).groups() Case-C: s = 'ab' p = r'(ab?)*?b' re.match(p, s).groups() Case-A Case-B Case-C PHP 7.3.2 NULL,"a" NULL,"a" "a" Java 11.0.2 "b","a" [1] "b","a" [1] "a" Perl 5.28.1 "b","a" "b","a" "a" Ruby 2.6.1 nil,"a" nil,"a" "a" Go 1.12 [2] [2] "a" Rust 1.32.0 [2] [2] "a" Node.js 10.15.1 undef,"a" undef,"a" "a" regex 2019.3.12 None,"a" None,"a" "a" re before patch "b","a" "b","a" "" [3] re after patch None,"a" None,"a" "a" [1] seems this bug: https://bugs.java.com/bugdatabase/view_bug.do?bug_id=7145888 [2] doesn't support lookaround yet. [3] "" is empty str, not None. --- Lib/test/test_re.py | 16 ++++++++++++++++ Modules/sre_lib.h | 19 ++++++++++++++++--- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 1946f2857ed2c6..5eeee955ed261e 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2117,6 +2117,22 @@ def test_bug_35859(self): p = r'((x|yz)+?(t)??c)*' self.assertEqual(re.match(p, s).groups(), ('xyzxc', 'x', 't')) + # JUMP_REPEAT_ONE_1 should MARK_PUSH() if in a repeat + s = 'aabaab' + p = r'(?:[^b]*a(?=(b)|(a))ab)*' + m = re.match(p, s) + self.assertEqual(m.span(), (0, 6)) + self.assertEqual(m.groups(), (None, 'a')) + + # JUMP_REPEAT_ONE_2 should MARK_PUSH() if in a repeat + s = 'abab' + p = r'(?:[^b]*(?=(b)|(a))ab)*' + m = re.match(p, s) + self.assertEqual(m.span(), (0, 4)) + self.assertEqual(m.groups(), (None, 'a')) + + self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('a',)) + class PatternReprTests(unittest.TestCase): def check(self, pattern, expected): diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index b6a3bdd0eaa004..2cdee860d96dbb 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -867,6 +867,8 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) } LASTMARK_SAVE(); + if (state->repeat) + MARK_PUSH(ctx->lastmark); if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) { /* tail starts with a literal. skip positions where @@ -884,16 +886,20 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1, ctx->pattern+ctx->pattern[0]); if (ret) { + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_SUCCESS; } - + if (state->repeat) + MARK_POP_KEEP(ctx->lastmark); LASTMARK_RESTORE(); ctx->ptr--; ctx->count--; } - + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); } else { /* general case */ while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) { @@ -901,13 +907,20 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2, ctx->pattern+ctx->pattern[0]); if (ret) { + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_SUCCESS; } + if (state->repeat) + MARK_POP_KEEP(ctx->lastmark); + LASTMARK_RESTORE(); + ctx->ptr--; ctx->count--; - LASTMARK_RESTORE(); } + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); } RETURN_FAILURE; From 5772bb1be8b0a74c60452fbe23165d92c9b53eee Mon Sep 17 00:00:00 2001 From: animalize Date: Thu, 14 Mar 2019 08:39:42 +0800 Subject: [PATCH 06/21] 6. JUMP_MIN_REPEAT_ONE should MARK_PUSH() if in a repeat Case-A: s = 'abab' p = r'(?:.*?(?=(a)|(b))b)*' re.match(p, s).groups() Case-B: s = 'axxzaz' p = r'(?:a*?(xx)??z)*' re.match(p, s).groups() Case-A Case-B PHP 7.3.2 NULL,"b" "xx" Java 11.0.2 "a","b" [1] "xx" Perl 5.28.1 "a","b" undef [2] Ruby 2.6.1 nil,"b" "xx" Go 1.12 [3] "xx" Rust 1.32.0 [3] "xx" Node.js 10.15.1 undef,"b" undef regex 2019.3.12 None,"b" "xx" re before patch "a","b" "" [4] re after patch None,"b" "xx" [1] seems this bug: https://bugs.java.com/bugdatabase/view_bug.do?bug_id=7145888 [2] different policy, not bug. [3] doesn't support lookaround yet. [4] "" is empty str, not None. --- Lib/test/test_re.py | 11 +++++++++++ Modules/sre_lib.h | 12 +++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 5eeee955ed261e..1074bfe1ce1140 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2133,6 +2133,17 @@ def test_bug_35859(self): self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('a',)) + # JUMP_MIN_REPEAT_ONE should MARK_PUSH() if in a repeat + s = 'abab' + p = r'(?:.*?(?=(a)|(b))b)*' + m = re.match(p, s) + self.assertEqual(m.span(), (0, 4)) + self.assertEqual(m.groups(), (None, 'b')) + + s = 'axxzaz' + p = r'(?:a*?(xx)??z)*' + self.assertEqual(re.match(p, s).groups(), ('xx',)) + class PatternReprTests(unittest.TestCase): def check(self, pattern, expected): diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 2cdee860d96dbb..0cfb1741fa1f33 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -969,15 +969,24 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) } else { /* general case */ LASTMARK_SAVE(); + if (state->repeat) + MARK_PUSH(ctx->lastmark); + while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT || ctx->count <= (Py_ssize_t)ctx->pattern[2]) { state->ptr = ctx->ptr; DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one, ctx->pattern+ctx->pattern[0]); if (ret) { + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_SUCCESS; } + if (state->repeat) + MARK_POP_KEEP(ctx->lastmark); + LASTMARK_RESTORE(); + state->ptr = ctx->ptr; ret = SRE(count)(state, ctx->pattern+3, 1); RETURN_ON_ERROR(ret); @@ -987,8 +996,9 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) assert(ret == 1); ctx->ptr++; ctx->count++; - LASTMARK_RESTORE(); } + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); } RETURN_FAILURE; From 738ed31e376ead431c565a65bff5ff151b80c522 Mon Sep 17 00:00:00 2001 From: animalize Date: Thu, 14 Mar 2019 08:42:00 +0800 Subject: [PATCH 07/21] 7. JUMP_ASSERT_NOT should LASTMARK_SAVE() JUMP_ASSERT_NOT is special, when the sub-pattern fails, the match succeeds. Here is a detailed explanation came from Greg Chapman: > Greg Chapman wrote in issue725149's patch: > > For negative assertions, if the subpattern fails, the assertion > succeeds and we want to continue matching. However, in failing, > the subpattern may have left any capturing groups it contains > in an inconsistent (partially matched) state. For consistency, > "unmark" all groups in the subpattern after it completes by > restoring marks to the state they had before executing the > subpattern. When not in a repeat, it is sufficient simply to > restore lastmark to its previous value (since a repeat is the > only way to move from right to left in a pattern, without a repeat > marks in the assertion will be greater than any so far > encountered. This bug was initially reported in issue725149. Case-A: re.match(r'(?!(..)c)', 'ab').groups() Case-B: re.match(r'(?:(?!(ab)c).)*', 'ab').groups() Case-A Case-B PHP 7.3.2 NULL NULL Java 11.0.2 null null Perl 5.28.1 "ab" "ab" Ruby 2.6.1 nil nil Go 1.12 [1] [1] Rust 1.32.0 [1] [1] Node.js 10.15.1 undef undef regex 2019.3.12 None None re before patch "ab" "b" re after patch None None [1] doesn't support lookaround yet. --- Lib/test/test_re.py | 4 ++++ Modules/sre_lib.h | 2 ++ 2 files changed, 6 insertions(+) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 1074bfe1ce1140..3f19a3a67612c0 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2144,6 +2144,10 @@ def test_bug_35859(self): p = r'(?:a*?(xx)??z)*' self.assertEqual(re.match(p, s).groups(), ('xx',)) + # JUMP_ASSERT_NOT should LASTMARK_SAVE() + # reported in issue725149 + self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), (None,)) + class PatternReprTests(unittest.TestCase): def check(self, pattern, expected): diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 0cfb1741fa1f33..567bcbe84d3e37 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -1316,11 +1316,13 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) ctx->ptr, ctx->pattern[1])); if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) { state->ptr = ctx->ptr - ctx->pattern[1]; + LASTMARK_SAVE(); DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2); if (ret) { RETURN_ON_ERROR(ret); RETURN_FAILURE; } + LASTMARK_RESTORE(); } ctx->pattern += ctx->pattern[0]; break; From abde0e99a8f38438ef972f961d1dd4e33a6c3001 Mon Sep 17 00:00:00 2001 From: animalize Date: Thu, 14 Mar 2019 08:44:16 +0800 Subject: [PATCH 08/21] 8. JUMP_ASSERT_NOT should MARK_PUSH() if in a repeat re.match(r'((?!(ab)c)(.))*', 'abab').groups() PHP 7.3.2 "b",NULL,"b" Java 11.0.2 "b",null,"b" Perl 5.28.1 "b","ab","b" Ruby 2.6.1 "b",nil,"b" Go 1.12 [1] Rust 1.32.0 [1] Node.js 10.15.1 "b",undef,"b" regex 2019.3.12 "b",None,"b" re before patch "b","b","b" re after patch "b",None,"b" [1] doesn't support lookaround yet. --- Lib/test/test_re.py | 5 +++++ Modules/sre_lib.h | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 3f19a3a67612c0..8c9d85f727176d 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2148,6 +2148,11 @@ def test_bug_35859(self): # reported in issue725149 self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), (None,)) + # JUMP_ASSERT_NOT should MARK_PUSH() if in a repeat + m = re.match(r'((?!(ab)c)(.))*', 'abab') + self.assertEqual(m.span(), (0, 4)) + self.assertEqual(m.groups(), ('b', None, 'b')) + class PatternReprTests(unittest.TestCase): def check(self, pattern, expected): diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 567bcbe84d3e37..92bfa2d4dd7d9e 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -1317,11 +1317,18 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) { state->ptr = ctx->ptr - ctx->pattern[1]; LASTMARK_SAVE(); + if (state->repeat) + MARK_PUSH(ctx->lastmark); + DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2); if (ret) { + if (state->repeat) + MARK_POP_DISCARD(ctx->lastmark); RETURN_ON_ERROR(ret); RETURN_FAILURE; } + if (state->repeat) + MARK_POP(ctx->lastmark); LASTMARK_RESTORE(); } ctx->pattern += ctx->pattern[0]; From 32b97979e0f4269915c614c74bac924bb738174b Mon Sep 17 00:00:00 2001 From: animalize Date: Thu, 14 Mar 2019 08:50:18 +0800 Subject: [PATCH 09/21] 9. reduce sizeof(match_context) On 32 bit platform, 36 bytes -> 32 bytes. On 64 bit platform, 72 bytes -> 64 bytes. ctx->toplevel is used as a boolean type. ctx->jump's possible value is below: #define JUMP_NONE 0 #define JUMP_MAX_UNTIL_1 1 #define JUMP_MAX_UNTIL_2 2 #define JUMP_MAX_UNTIL_3 3 #define JUMP_MIN_UNTIL_1 4 #define JUMP_MIN_UNTIL_2 5 #define JUMP_MIN_UNTIL_3 6 #define JUMP_REPEAT 7 #define JUMP_REPEAT_ONE_1 8 #define JUMP_REPEAT_ONE_2 9 #define JUMP_MIN_REPEAT_ONE 10 #define JUMP_BRANCH 11 #define JUMP_ASSERT 12 #define JUMP_ASSERT_NOT 13 --- Modules/sre_lib.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 92bfa2d4dd7d9e..6ac413b3338d9a 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -529,8 +529,6 @@ do { \ DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0) typedef struct { - Py_ssize_t last_ctx_pos; - Py_ssize_t jump; SRE_CHAR* ptr; SRE_CODE* pattern; Py_ssize_t count; @@ -540,7 +538,9 @@ typedef struct { SRE_CODE chr; SRE_REPEAT* rep; } u; - int toplevel; + Py_ssize_t last_ctx_pos; + int8_t jump; + int8_t toplevel; } SRE(match_context); /* check if string matches the given pattern. returns <0 for @@ -551,8 +551,8 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) SRE_CHAR* end = (SRE_CHAR *)state->end; Py_ssize_t alloc_pos, ctx_pos = -1; Py_ssize_t i, ret = 0; - Py_ssize_t jump; unsigned int sigcount=0; + int8_t jump; SRE(match_context)* ctx; SRE(match_context)* nextctx; From 7ff501b4b0ce47a9868274d1740d280bd528259d Mon Sep 17 00:00:00 2001 From: animalize Date: Thu, 14 Mar 2019 09:02:20 +0800 Subject: [PATCH 10/21] 10. limit max group to 1,073,741,823 1,073,741,823 groups should enough for most users. This change reduces sizeof(match_context) again: - On 32 bit platform: 32 bytes, no change. - On 64 bit platform: 64 bytes -> 56 bytes. sre uses stack and match_context struct to simulate recursive call, smaller struct brings: - deeper recursive call - less memory consume - less memory realloc Here is a test, if limit the stack size (state->data_stack_base) to 1 GiB, the max available value of n is: re.match(r'(ab)*', n * 'ab') # need to save MARKs 72 bytes: n = 11,184,809 64 bytes: n = 12,201,610 56 bytes: n = 13,421,771 re.match(r'(?:ab)*', n * 'ab') # no need to save MARKs 72 bytes: n = 13,421,770 64 bytes: n = 14,913,079 56 bytes: n = 16,777,214 --- Modules/sre.h | 14 ++++++++------ Modules/sre_lib.h | 7 ++++--- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/Modules/sre.h b/Modules/sre.h index a7284881457c3b..08bddf6751e10d 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -16,12 +16,14 @@ /* size of a code word (must be unsigned short or larger, and large enough to hold a UCS4 character) */ #define SRE_CODE Py_UCS4 + +/* SRE_MAXGROUPS is 1,073,741,823 */ +#define SRE_MAXGROUPS INT_MAX / 2 + #if SIZEOF_SIZE_T > 4 # define SRE_MAXREPEAT (~(SRE_CODE)0) -# define SRE_MAXGROUPS ((~(SRE_CODE)0) / 2) #else # define SRE_MAXREPEAT ((SRE_CODE)PY_SSIZE_T_MAX) -# define SRE_MAXGROUPS ((SRE_CODE)PY_SSIZE_T_MAX / SIZEOF_SIZE_T / 2) #endif typedef struct { @@ -71,9 +73,11 @@ typedef struct { Py_ssize_t pos, endpos; int isbytes; int charsize; /* character size */ + /* current repeat context */ + SRE_REPEAT *repeat; /* registers */ - Py_ssize_t lastindex; - Py_ssize_t lastmark; + int32_t lastmark; + int32_t lastindex; void** mark; int match_all; int must_advance; @@ -81,8 +85,6 @@ typedef struct { char* data_stack; size_t data_stack_size; size_t data_stack_base; - /* current repeat context */ - SRE_REPEAT *repeat; } SRE_STATE; typedef struct { diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 6ac413b3338d9a..05dd6b14ddb48a 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -532,8 +532,8 @@ typedef struct { SRE_CHAR* ptr; SRE_CODE* pattern; Py_ssize_t count; - Py_ssize_t lastmark; - Py_ssize_t lastindex; + int32_t lastmark; + int32_t lastindex; union { SRE_CODE chr; SRE_REPEAT* rep; @@ -550,7 +550,8 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) { SRE_CHAR* end = (SRE_CHAR *)state->end; Py_ssize_t alloc_pos, ctx_pos = -1; - Py_ssize_t i, ret = 0; + Py_ssize_t ret = 0; + int32_t i; unsigned int sigcount=0; int8_t jump; From 2b359f3f4df9a741b23322ae3446b7cddc56421f Mon Sep 17 00:00:00 2001 From: animalize Date: Thu, 14 Mar 2019 09:05:41 +0800 Subject: [PATCH 11/21] 11. raise RuntimeError if the span of capturing group is wrong These bugs about capturing group, sometimes lead to wrong span (begin > end): Before the fix: >>> re.match(r'(.b|a)*?b', 'ab').span(1) (2, 1) >>> re.match(r'(?:(?:a|bc)*?(xx)??z)*', 'axxzbcz').span(1) (4, 3) When (begin > end), PyUnicode_Substring() returns an empty string silently. This change is a safety-check, maybe it can reveal undiscovered bugs. Check at two moments to avoid redundant check: - getting a capture group from state object directly - creating a Match object --- Modules/_sre.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Modules/_sre.c b/Modules/_sre.c index 5cea7562f2807a..57d4af9d594ff0 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -511,6 +511,14 @@ state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty) } else { i = STATE_OFFSET(state, state->mark[index]); j = STATE_OFFSET(state, state->mark[index+1]); + + /* check wrong span */ + if (i > j) { + PyErr_SetString(PyExc_RuntimeError, + "the span of capturing group is wrong," + " please report a bug."); + return NULL; + } } return getslice(state->isbytes, state->beginning, string, i, j); @@ -2354,6 +2362,15 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state, Py_ssize_t status) if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) { match->mark[j+2] = ((char*) state->mark[j] - base) / n; match->mark[j+3] = ((char*) state->mark[j+1] - base) / n; + + /* check wrong span */ + if (match->mark[j+2] > match->mark[j+3]) { + PyErr_SetString(PyExc_RuntimeError, + "the span of capturing group is wrong," + " please report a bug."); + Py_DECREF(match); + return NULL; + } } else match->mark[j+2] = match->mark[j+3] = -1; /* undefined */ From 3ae8e696db1de2cca63557de46ae09ff608ebe97 Mon Sep 17 00:00:00 2001 From: animalize Date: Thu, 14 Mar 2019 09:07:27 +0800 Subject: [PATCH 12/21] 12. clean error process code 1. Remove error code SRE_ERROR_RECURSION_LIMIT. This code has not been used since commit 2cbdc2a4 (2003-Dec-14). 2. Return SRE_ERROR_MEMORY immediately when allocate memory fail. pattern_error() function will process SRE_ERROR_MEMORY properly. (tested) --- Modules/_sre.c | 8 -------- Modules/sre_lib.h | 7 +++---- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/Modules/_sre.c b/Modules/_sre.c index 57d4af9d594ff0..8d2d38680f75e3 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -73,7 +73,6 @@ static const char copyright[] = /* error codes */ #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */ #define SRE_ERROR_STATE -2 /* illegal state */ -#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */ #define SRE_ERROR_MEMORY -9 /* out of memory */ #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */ @@ -528,13 +527,6 @@ static void pattern_error(Py_ssize_t status) { switch (status) { - case SRE_ERROR_RECURSION_LIMIT: - /* This error code seems to be unused. */ - PyErr_SetString( - PyExc_RecursionError, - "maximum recursion limit exceeded" - ); - break; case SRE_ERROR_MEMORY: PyErr_NoMemory(); break; diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 05dd6b14ddb48a..ababbe3bb8a9c9 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -1012,10 +1012,9 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel) /* install new repeat context */ ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep)); - if (!ctx->u.rep) { - PyErr_NoMemory(); - RETURN_FAILURE; - } + if (!ctx->u.rep) + RETURN_ERROR(SRE_ERROR_MEMORY); + ctx->u.rep->count = -1; ctx->u.rep->pattern = ctx->pattern; ctx->u.rep->prev = state->repeat; From c84d7ed3a70543f26f73fba982a438d620722090 Mon Sep 17 00:00:00 2001 From: animalize Date: Fri, 1 May 2020 15:27:07 +0800 Subject: [PATCH 13/21] Address review comments --- Modules/_sre.c | 12 ++++++------ Modules/sre.h | 2 +- Modules/sre_lib.h | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Modules/_sre.c b/Modules/_sre.c index 72c2175ad7f604..3e0fccc917d099 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -513,9 +513,9 @@ state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty) /* check wrong span */ if (i > j) { - PyErr_SetString(PyExc_RuntimeError, - "the span of capturing group is wrong," - " please report a bug."); + PyErr_SetString(PyExc_SystemError, + "The span of capturing group is wrong," + " please report a bug for the re module."); return NULL; } } @@ -2356,9 +2356,9 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state, Py_ssize_t status) /* check wrong span */ if (match->mark[j+2] > match->mark[j+3]) { - PyErr_SetString(PyExc_RuntimeError, - "the span of capturing group is wrong," - " please report a bug."); + PyErr_SetString(PyExc_SystemError, + "The span of capturing group is wrong," + " please report a bug for the re module."); Py_DECREF(match); return NULL; } diff --git a/Modules/sre.h b/Modules/sre.h index 1d263b0fb438d6..267038df7bb37a 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -18,7 +18,7 @@ #define SRE_CODE Py_UCS4 /* SRE_MAXGROUPS is 1,073,741,823 */ -#define SRE_MAXGROUPS INT_MAX / 2 +#define SRE_MAXGROUPS (INT_MAX / 2) #if SIZEOF_SIZE_T > 4 # define SRE_MAXREPEAT (~(SRE_CODE)0) diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index e48c6497dac6bd..6d61ed67b50d8a 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -1093,7 +1093,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) tail matches */ state->repeat = ctx->u.rep->prev; DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern); - state->repeat = ctx->u.rep; /* restore repeat before return */ + state->repeat = ctx->u.rep; // restore repeat before return RETURN_ON_SUCCESS(ret); state->ptr = ctx->ptr; @@ -1137,7 +1137,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern); SRE_REPEAT *repeat_of_tail = state->repeat; - state->repeat = ctx->u.rep; /* restore repeat before return */ + state->repeat = ctx->u.rep; // restore repeat before return if (ret) { if (repeat_of_tail) From d671c44a95ed127047ba3b7873e463ddd737b49b Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Mon, 21 Mar 2022 20:39:25 +0800 Subject: [PATCH 14/21] fix NEWS file --- .../next/Library/2019-03-14-09-08-25.bpo-35859.8lFdLe.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2019-03-14-09-08-25.bpo-35859.8lFdLe.rst b/Misc/NEWS.d/next/Library/2019-03-14-09-08-25.bpo-35859.8lFdLe.rst index fa27434a62862c..8c88ef01164e46 100644 --- a/Misc/NEWS.d/next/Library/2019-03-14-09-08-25.bpo-35859.8lFdLe.rst +++ b/Misc/NEWS.d/next/Library/2019-03-14-09-08-25.bpo-35859.8lFdLe.rst @@ -1,2 +1,2 @@ -re module, fix a few bugs about capturing group. In rare cases, capturing -group gets an incorrect string. Patch by Ma Lin. \ No newline at end of file +:mod:`re` module, fix a few bugs about capturing group. In rare cases, +capturing group gets an incorrect string. Patch by Ma Lin. From ccbdb92772ad9ab0105c4258859cc4936e8fd0b4 Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Mon, 21 Mar 2022 21:29:42 +0800 Subject: [PATCH 15/21] Revert "10. limit max group to 1,073,741,823" This reverts commit 7ff501b4b0ce47a9868274d1740d280bd528259d. --- Modules/sre.h | 14 ++++++-------- Modules/sre_lib.h | 7 +++---- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/Modules/sre.h b/Modules/sre.h index 94dd0a8614f6bf..785adbd003e7fd 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -16,14 +16,12 @@ /* size of a code word (must be unsigned short or larger, and large enough to hold a UCS4 character) */ #define SRE_CODE Py_UCS4 - -/* SRE_MAXGROUPS is 1,073,741,823 */ -#define SRE_MAXGROUPS (INT_MAX / 2) - #if SIZEOF_SIZE_T > 4 # define SRE_MAXREPEAT (~(SRE_CODE)0) +# define SRE_MAXGROUPS ((~(SRE_CODE)0) / 2) #else # define SRE_MAXREPEAT ((SRE_CODE)PY_SSIZE_T_MAX) +# define SRE_MAXGROUPS ((SRE_CODE)PY_SSIZE_T_MAX / SIZEOF_SIZE_T / 2) #endif typedef struct { @@ -73,11 +71,9 @@ typedef struct { Py_ssize_t pos, endpos; int isbytes; int charsize; /* character size */ - /* current repeat context */ - SRE_REPEAT *repeat; /* registers */ - int32_t lastmark; - int32_t lastindex; + Py_ssize_t lastindex; + Py_ssize_t lastmark; const void** mark; int match_all; int must_advance; @@ -85,6 +81,8 @@ typedef struct { char* data_stack; size_t data_stack_size; size_t data_stack_base; + /* current repeat context */ + SRE_REPEAT *repeat; } SRE_STATE; typedef struct { diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index c974d3bbdb1ea1..8ca057b3334dfe 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -503,8 +503,8 @@ typedef struct { const SRE_CHAR* ptr; const SRE_CODE* pattern; Py_ssize_t count; - int32_t lastmark; - int32_t lastindex; + Py_ssize_t lastmark; + Py_ssize_t lastindex; union { SRE_CODE chr; SRE_REPEAT* rep; @@ -521,8 +521,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) { const SRE_CHAR* end = (const SRE_CHAR *)state->end; Py_ssize_t alloc_pos, ctx_pos = -1; - Py_ssize_t ret = 0; - int32_t i; + Py_ssize_t i, ret = 0; unsigned int sigcount=0; int8_t jump; From 18bb63bf40d6a96946d92e112e53fef9c47902a7 Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Mon, 21 Mar 2022 21:33:16 +0800 Subject: [PATCH 16/21] Revert "9. reduce sizeof(match_context)" This reverts commit 32b97979e0f4269915c614c74bac924bb738174b. --- Modules/sre_lib.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 8ca057b3334dfe..3ba18117a8f1d6 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -500,6 +500,8 @@ do { \ DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0) typedef struct { + Py_ssize_t last_ctx_pos; + Py_ssize_t jump; const SRE_CHAR* ptr; const SRE_CODE* pattern; Py_ssize_t count; @@ -509,9 +511,7 @@ typedef struct { SRE_CODE chr; SRE_REPEAT* rep; } u; - Py_ssize_t last_ctx_pos; - int8_t jump; - int8_t toplevel; + int toplevel; } SRE(match_context); /* check if string matches the given pattern. returns <0 for @@ -522,8 +522,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) const SRE_CHAR* end = (const SRE_CHAR *)state->end; Py_ssize_t alloc_pos, ctx_pos = -1; Py_ssize_t i, ret = 0; + Py_ssize_t jump; unsigned int sigcount=0; - int8_t jump; SRE(match_context)* ctx; SRE(match_context)* nextctx; From e084e647c92ed1639bbb37c614a93bccaafb573d Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Mon, 21 Mar 2022 23:34:05 +0800 Subject: [PATCH 17/21] split unit-tests --- Lib/test/test_re.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 1e337e432dfbf4..2e62bc77028e43 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2066,14 +2066,16 @@ def test_bug_34294(self): {'tag': 'foo', 'text': None}, {'tag': 'foo', 'text': None}]) - def test_bug_35859(self): - # MARK_PUSH() macro didn't protect MARK-0 - # if it was the only available mark. + def test_MARK_PUSH_macro_bug(self): + # issue35859, MARK_PUSH() macro didn't protect MARK-0 if it + # was the only available mark. self.assertEqual(re.match(r'(ab|a)*?b', 'ab').groups(), ('a',)) self.assertEqual(re.match(r'(ab|a)+?b', 'ab').groups(), ('a',)) self.assertEqual(re.match(r'(ab|a){0,2}?b', 'ab').groups(), ('a',)) self.assertEqual(re.match(r'(.b|a)*?b', 'ab').groups(), ('a',)) + def test_MIN_UNTIL_mark_bug(self): + # Fixed in issue35859, reported in issue9134. # JUMP_MIN_UNTIL_2 should MARK_PUSH() if in a repeat s = 'axxzbcz' p = r'(?:(?:a|bc)*?(xx)??z)*' @@ -2083,6 +2085,8 @@ def test_bug_35859(self): p = r'((x|yz)+?(t)??c)*' self.assertEqual(re.match(p, s).groups(), ('xyzxc', 'x', 't')) + def test_REPEAT_ONE_mark_bug(self): + # issue35859 # JUMP_REPEAT_ONE_1 should MARK_PUSH() if in a repeat s = 'aabaab' p = r'(?:[^b]*a(?=(b)|(a))ab)*' @@ -2099,6 +2103,8 @@ def test_bug_35859(self): self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('a',)) + def test_MIN_REPEAT_ONE_mark_bug(self): + # issue35859 # JUMP_MIN_REPEAT_ONE should MARK_PUSH() if in a repeat s = 'abab' p = r'(?:.*?(?=(a)|(b))b)*' @@ -2110,8 +2116,9 @@ def test_bug_35859(self): p = r'(?:a*?(xx)??z)*' self.assertEqual(re.match(p, s).groups(), ('xx',)) + def test_ASSERT_NOT_mark_bug(self): + # Fixed in issue35859, reported in issue725149. # JUMP_ASSERT_NOT should LASTMARK_SAVE() - # reported in issue725149 self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), (None,)) # JUMP_ASSERT_NOT should MARK_PUSH() if in a repeat From 94a4fdc5dec8ea8362684f94f2bf84107f88d0cc Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Mon, 21 Mar 2022 23:48:06 +0800 Subject: [PATCH 18/21] Revert "1. add asserts" This reverts commit f738245ad94307f126e0f8985c66a7aad508ad3d. --- Modules/sre_lib.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 3ba18117a8f1d6..6efdf740ddb7ce 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -1319,15 +1319,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) ctx_pos = ctx->last_ctx_pos; jump = ctx->jump; DATA_POP_DISCARD(ctx); - if (ctx_pos == -1) { -#ifdef Py_DEBUG - if (ctx->toplevel && ret >= 0) { - assert(state->repeat == NULL); - assert(state->data_stack_base == 0); - } -#endif + if (ctx_pos == -1) return ret; - } DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); switch (jump) { @@ -1376,8 +1369,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) break; } - /* should never get here */ - Py_UNREACHABLE(); + return ret; /* should never get here */ } /* need to reset capturing groups between two SRE(match) callings in loops */ From 49d00d20e838409d8a7156528c007dfa50977454 Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Tue, 22 Mar 2022 13:42:50 +0800 Subject: [PATCH 19/21] add .span(group_num) to unit-tests --- Lib/test/test_re.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 2e62bc77028e43..27d42f589c4579 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2080,10 +2080,14 @@ def test_MIN_UNTIL_mark_bug(self): s = 'axxzbcz' p = r'(?:(?:a|bc)*?(xx)??z)*' self.assertEqual(re.match(p, s).groups(), ('xx',)) + # test-case provided by issue9134 s = 'xtcxyzxc' p = r'((x|yz)+?(t)??c)*' - self.assertEqual(re.match(p, s).groups(), ('xyzxc', 'x', 't')) + m = re.match(p, s) + self.assertEqual(m.span(), (0, 8)) + self.assertEqual(m.span(2), (6, 7)) + self.assertEqual(m.groups(), ('xyzxc', 'x', 't')) def test_REPEAT_ONE_mark_bug(self): # issue35859 @@ -2092,6 +2096,7 @@ def test_REPEAT_ONE_mark_bug(self): p = r'(?:[^b]*a(?=(b)|(a))ab)*' m = re.match(p, s) self.assertEqual(m.span(), (0, 6)) + self.assertEqual(m.span(2), (4, 5)) self.assertEqual(m.groups(), (None, 'a')) # JUMP_REPEAT_ONE_2 should MARK_PUSH() if in a repeat @@ -2099,6 +2104,7 @@ def test_REPEAT_ONE_mark_bug(self): p = r'(?:[^b]*(?=(b)|(a))ab)*' m = re.match(p, s) self.assertEqual(m.span(), (0, 4)) + self.assertEqual(m.span(2), (2, 3)) self.assertEqual(m.groups(), (None, 'a')) self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('a',)) @@ -2110,6 +2116,7 @@ def test_MIN_REPEAT_ONE_mark_bug(self): p = r'(?:.*?(?=(a)|(b))b)*' m = re.match(p, s) self.assertEqual(m.span(), (0, 4)) + self.assertEqual(m.span(2), (3, 4)) self.assertEqual(m.groups(), (None, 'b')) s = 'axxzaz' @@ -2124,6 +2131,8 @@ def test_ASSERT_NOT_mark_bug(self): # JUMP_ASSERT_NOT should MARK_PUSH() if in a repeat m = re.match(r'((?!(ab)c)(.))*', 'abab') self.assertEqual(m.span(), (0, 4)) + self.assertEqual(m.span(1), (3, 4)) + self.assertEqual(m.span(3), (3, 4)) self.assertEqual(m.groups(), ('b', None, 'b')) def test_bug_40736(self): From 785d9bfa2a2f1f7c011e99d440173cdf697c87ac Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Mon, 28 Mar 2022 17:48:56 +0800 Subject: [PATCH 20/21] Revert "12. clean error process code" --- Modules/_sre.c | 8 ++++++++ Modules/sre_lib.h | 7 ++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Modules/_sre.c b/Modules/_sre.c index 04aa16b188e509..cef57e98f8bca2 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -75,6 +75,7 @@ static const char copyright[] = /* error codes */ #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */ #define SRE_ERROR_STATE -2 /* illegal state */ +#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */ #define SRE_ERROR_MEMORY -9 /* out of memory */ #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */ @@ -548,6 +549,13 @@ static void pattern_error(Py_ssize_t status) { switch (status) { + case SRE_ERROR_RECURSION_LIMIT: + /* This error code seems to be unused. */ + PyErr_SetString( + PyExc_RecursionError, + "maximum recursion limit exceeded" + ); + break; case SRE_ERROR_MEMORY: PyErr_NoMemory(); break; diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 6efdf740ddb7ce..dd414e795287e8 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -981,9 +981,10 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* install new repeat context */ ctx->u.rep = (SRE_REPEAT*) PyObject_Malloc(sizeof(*ctx->u.rep)); - if (!ctx->u.rep) - RETURN_ERROR(SRE_ERROR_MEMORY); - + if (!ctx->u.rep) { + PyErr_NoMemory(); + RETURN_FAILURE; + } ctx->u.rep->count = -1; ctx->u.rep->pattern = ctx->pattern; ctx->u.rep->prev = state->repeat; From 3fe33a859e96e696f99fd2173abd9cadd869a3e2 Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Tue, 29 Mar 2022 20:07:12 +0800 Subject: [PATCH 21/21] Porting to Python 3.11 --- Doc/whatsnew/3.11.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst index b7e9dc6e9e37f8..6b37e64cbaf987 100644 --- a/Doc/whatsnew/3.11.rst +++ b/Doc/whatsnew/3.11.rst @@ -693,6 +693,11 @@ Changes in the Python API deprecated since Python 3.6. (Contributed by Serhiy Storchaka in :issue:`47066`.) +* :mod:`re` module: Fix a few long-standing bugs where, in rare cases, + capturing group could get wrong result. So the result may be different than + before. + (Contributed by Ma Lin in :issue:`35859`.) + Build Changes =============