Skip to content

bpo-35859: fix bugs in re engine #12427

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from Mar 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
f738245
1. add asserts
wjssz Mar 8, 2019
0d1e658
2. macro MARK_PUSH() bug
wjssz Mar 8, 2019
cbc10fc
3. restore state->repeat at the first opportunity
wjssz Mar 14, 2019
1043bc6
4. JUMP_MIN_UNTIL_2 should MARK_PUSH() if in a repeat
wjssz Mar 14, 2019
8e1407e
5. JUMP_REPEAT_ONE_[12] should MARK_PUSH() if in a repeat
wjssz Mar 14, 2019
5772bb1
6. JUMP_MIN_REPEAT_ONE should MARK_PUSH() if in a repeat
wjssz Mar 14, 2019
738ed31
7. JUMP_ASSERT_NOT should LASTMARK_SAVE()
wjssz Mar 14, 2019
abde0e9
8. JUMP_ASSERT_NOT should MARK_PUSH() if in a repeat
wjssz Mar 14, 2019
32b9797
9. reduce sizeof(match_context)
wjssz Mar 14, 2019
7ff501b
10. limit max group to 1,073,741,823
wjssz Mar 14, 2019
2b359f3
11. raise RuntimeError if the span of capturing group is wrong
wjssz Mar 14, 2019
3ae8e69
12. clean error process code
wjssz Mar 14, 2019
5b3e43d
Merge branch 'master' into Solution_A_repeat
animalize Apr 13, 2020
c84d7ed
Address review comments
wjssz May 1, 2020
b00c6e3
Merge branch 'main' into Solution_A_repeat
serhiy-storchaka Mar 21, 2022
d671c44
fix NEWS file
wjssz Mar 21, 2022
ccbdb92
Revert "10. limit max group to 1,073,741,823"
wjssz Mar 21, 2022
18bb63b
Revert "9. reduce sizeof(match_context)"
wjssz Mar 21, 2022
e084e64
split unit-tests
wjssz Mar 21, 2022
94a4fdc
Revert "1. add asserts"
wjssz Mar 21, 2022
49d00d2
add .span(group_num) to unit-tests
wjssz Mar 22, 2022
785d9bf
Revert "12. clean error process code"
wjssz Mar 28, 2022
3fe33a8
Porting to Python 3.11
wjssz Mar 29, 2022
d6c07b6
Merge branch 'main' into Solution_A_repeat
animalize Mar 29, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Doc/whatsnew/3.11.rst
Original file line number Diff line number Diff line change
Expand Up @@ -718,6 +718,11 @@ Changes in the Python API
deprecated since Python 3.6.
(Contributed by Serhiy Storchaka in :issue:`47066`.)

* :mod:`re` module: Fix a few long-standing bugs where, in rare cases,
capturing group could get wrong result. So the result may be different than
before.
(Contributed by Ma Lin in :issue:`35859`.)

* The *population* parameter of :func:`random.sample` must be a sequence.
Automatic conversion of sets to lists is no longer supported. If the sample size
is larger than the population size, a :exc:`ValueError` is raised.
Expand Down
69 changes: 69 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -2033,6 +2033,75 @@ def test_bug_34294(self):
{'tag': 'foo', 'text': None},
{'tag': 'foo', 'text': None}])

def test_MARK_PUSH_macro_bug(self):
# issue35859, MARK_PUSH() macro didn't protect MARK-0 if it
# was the only available mark.
self.assertEqual(re.match(r'(ab|a)*?b', 'ab').groups(), ('a',))
self.assertEqual(re.match(r'(ab|a)+?b', 'ab').groups(), ('a',))
self.assertEqual(re.match(r'(ab|a){0,2}?b', 'ab').groups(), ('a',))
self.assertEqual(re.match(r'(.b|a)*?b', 'ab').groups(), ('a',))

def test_MIN_UNTIL_mark_bug(self):
# Fixed in issue35859, reported in issue9134.
# JUMP_MIN_UNTIL_2 should MARK_PUSH() if in a repeat
s = 'axxzbcz'
p = r'(?:(?:a|bc)*?(xx)??z)*'
self.assertEqual(re.match(p, s).groups(), ('xx',))

# test-case provided by issue9134
s = 'xtcxyzxc'
p = r'((x|yz)+?(t)??c)*'
m = re.match(p, s)
self.assertEqual(m.span(), (0, 8))
self.assertEqual(m.span(2), (6, 7))
self.assertEqual(m.groups(), ('xyzxc', 'x', 't'))

def test_REPEAT_ONE_mark_bug(self):
# issue35859
# JUMP_REPEAT_ONE_1 should MARK_PUSH() if in a repeat
s = 'aabaab'
p = r'(?:[^b]*a(?=(b)|(a))ab)*'
m = re.match(p, s)
self.assertEqual(m.span(), (0, 6))
self.assertEqual(m.span(2), (4, 5))
self.assertEqual(m.groups(), (None, 'a'))

# JUMP_REPEAT_ONE_2 should MARK_PUSH() if in a repeat
s = 'abab'
p = r'(?:[^b]*(?=(b)|(a))ab)*'
m = re.match(p, s)
self.assertEqual(m.span(), (0, 4))
self.assertEqual(m.span(2), (2, 3))
self.assertEqual(m.groups(), (None, 'a'))

self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('a',))

def test_MIN_REPEAT_ONE_mark_bug(self):
# issue35859
# JUMP_MIN_REPEAT_ONE should MARK_PUSH() if in a repeat
s = 'abab'
p = r'(?:.*?(?=(a)|(b))b)*'
m = re.match(p, s)
self.assertEqual(m.span(), (0, 4))
self.assertEqual(m.span(2), (3, 4))
self.assertEqual(m.groups(), (None, 'b'))

s = 'axxzaz'
p = r'(?:a*?(xx)??z)*'
self.assertEqual(re.match(p, s).groups(), ('xx',))

def test_ASSERT_NOT_mark_bug(self):
# Fixed in issue35859, reported in issue725149.
# JUMP_ASSERT_NOT should LASTMARK_SAVE()
self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), (None,))

# JUMP_ASSERT_NOT should MARK_PUSH() if in a repeat
m = re.match(r'((?!(ab)c)(.))*', 'abab')
self.assertEqual(m.span(), (0, 4))
self.assertEqual(m.span(1), (3, 4))
self.assertEqual(m.span(3), (3, 4))
self.assertEqual(m.groups(), ('b', None, 'b'))

def test_bug_40736(self):
with self.assertRaisesRegex(TypeError, "got 'int'"):
re.search("x*", 5)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
:mod:`re` module, fix a few bugs about capturing group. In rare cases,
capturing group gets an incorrect string. Patch by Ma Lin.
17 changes: 17 additions & 0 deletions Modules/_sre.c
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,14 @@ state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
} else {
i = STATE_OFFSET(state, state->mark[index]);
j = STATE_OFFSET(state, state->mark[index+1]);

/* check wrong span */
if (i > j) {
PyErr_SetString(PyExc_SystemError,
"The span of capturing group is wrong,"
" please report a bug for the re module.");
return NULL;
}
}

return getslice(state->isbytes, state->beginning, string, i, j);
Expand Down Expand Up @@ -2477,6 +2485,15 @@ pattern_new_match(_sremodulestate* module_state,
if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
match->mark[j+2] = ((char*) state->mark[j] - base) / n;
match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;

/* check wrong span */
if (match->mark[j+2] > match->mark[j+3]) {
PyErr_SetString(PyExc_SystemError,
"The span of capturing group is wrong,"
" please report a bug for the re module.");
Py_DECREF(match);
return NULL;
}
} else
match->mark[j+2] = match->mark[j+3] = -1; /* undefined */

Expand Down
78 changes: 59 additions & 19 deletions Modules/sre_lib.h
Original file line number Diff line number Diff line change
Expand Up @@ -449,20 +449,20 @@ do { \
DATA_STACK_LOOKUP_AT(state,t,p,pos)

#define MARK_PUSH(lastmark) \
do if (lastmark > 0) { \
do if (lastmark >= 0) { \
i = lastmark; /* ctx->lastmark may change if reallocated */ \
DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
} while (0)
#define MARK_POP(lastmark) \
do if (lastmark > 0) { \
do if (lastmark >= 0) { \
DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
} while (0)
#define MARK_POP_KEEP(lastmark) \
do if (lastmark > 0) { \
do if (lastmark >= 0) { \
DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
} while (0)
#define MARK_POP_DISCARD(lastmark) \
do if (lastmark > 0) { \
do if (lastmark >= 0) { \
DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
} while (0)

Expand Down Expand Up @@ -770,8 +770,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
/* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
LASTMARK_SAVE();
ctx->u.rep = state->repeat;
if (ctx->u.rep)
if (state->repeat)
MARK_PUSH(ctx->lastmark);
for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
if (ctx->pattern[1] == SRE_OP_LITERAL &&
Expand All @@ -786,16 +785,16 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
state->ptr = ctx->ptr;
DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
if (ret) {
if (ctx->u.rep)
if (state->repeat)
MARK_POP_DISCARD(ctx->lastmark);
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
if (ctx->u.rep)
if (state->repeat)
MARK_POP_KEEP(ctx->lastmark);
LASTMARK_RESTORE();
}
if (ctx->u.rep)
if (state->repeat)
MARK_POP_DISCARD(ctx->lastmark);
RETURN_FAILURE;

Expand Down Expand Up @@ -841,6 +840,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
}

LASTMARK_SAVE();
if (state->repeat)
MARK_PUSH(ctx->lastmark);

if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
/* tail starts with a literal. skip positions where
Expand All @@ -858,30 +859,41 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
ctx->pattern+ctx->pattern[0]);
if (ret) {
if (state->repeat)
MARK_POP_DISCARD(ctx->lastmark);
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}

if (state->repeat)
MARK_POP_KEEP(ctx->lastmark);
LASTMARK_RESTORE();

ctx->ptr--;
ctx->count--;
}

if (state->repeat)
MARK_POP_DISCARD(ctx->lastmark);
} else {
/* general case */
while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
state->ptr = ctx->ptr;
DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
ctx->pattern+ctx->pattern[0]);
if (ret) {
if (state->repeat)
MARK_POP_DISCARD(ctx->lastmark);
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
if (state->repeat)
MARK_POP_KEEP(ctx->lastmark);
LASTMARK_RESTORE();

ctx->ptr--;
ctx->count--;
LASTMARK_RESTORE();
}
if (state->repeat)
MARK_POP_DISCARD(ctx->lastmark);
}
RETURN_FAILURE;

Expand Down Expand Up @@ -930,15 +942,24 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
} else {
/* general case */
LASTMARK_SAVE();
if (state->repeat)
MARK_PUSH(ctx->lastmark);

while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
|| ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
state->ptr = ctx->ptr;
DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
ctx->pattern+ctx->pattern[0]);
if (ret) {
if (state->repeat)
MARK_POP_DISCARD(ctx->lastmark);
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
if (state->repeat)
MARK_POP_KEEP(ctx->lastmark);
LASTMARK_RESTORE();

state->ptr = ctx->ptr;
ret = SRE(count)(state, ctx->pattern+3, 1);
RETURN_ON_ERROR(ret);
Expand All @@ -948,8 +969,9 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
assert(ret == 1);
ctx->ptr++;
ctx->count++;
LASTMARK_RESTORE();
}
if (state->repeat)
MARK_POP_DISCARD(ctx->lastmark);
}
RETURN_FAILURE;

Expand Down Expand Up @@ -1098,8 +1120,9 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
tail matches */
state->repeat = ctx->u.rep->prev;
DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
state->repeat = ctx->u.rep; // restore repeat before return

RETURN_ON_SUCCESS(ret);
state->repeat = ctx->u.rep;
state->ptr = ctx->ptr;
RETURN_FAILURE;

Expand Down Expand Up @@ -1132,21 +1155,29 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
RETURN_FAILURE;
}

LASTMARK_SAVE();

/* see if the tail matches */
state->repeat = ctx->u.rep->prev;

LASTMARK_SAVE();
if (state->repeat)
MARK_PUSH(ctx->lastmark);

DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
SRE_REPEAT *repeat_of_tail = state->repeat;
state->repeat = ctx->u.rep; // restore repeat before return

if (ret) {
if (repeat_of_tail)
MARK_POP_DISCARD(ctx->lastmark);
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
if (repeat_of_tail)
MARK_POP(ctx->lastmark);
LASTMARK_RESTORE();

state->repeat = ctx->u.rep;
state->ptr = ctx->ptr;

LASTMARK_RESTORE();

if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
&& ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
state->ptr == ctx->u.rep->last_ptr)
Expand Down Expand Up @@ -1444,11 +1475,20 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
ctx->ptr, ctx->pattern[1]));
if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) {
state->ptr = ctx->ptr - ctx->pattern[1];
LASTMARK_SAVE();
if (state->repeat)
MARK_PUSH(ctx->lastmark);

DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
if (ret) {
if (state->repeat)
MARK_POP_DISCARD(ctx->lastmark);
RETURN_ON_ERROR(ret);
RETURN_FAILURE;
}
if (state->repeat)
MARK_POP(ctx->lastmark);
LASTMARK_RESTORE();
}
ctx->pattern += ctx->pattern[0];
break;
Expand Down