From a0093c06d08275e8f49441c94dfdb42969ce5008 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 19 Apr 2022 14:43:01 +0300 Subject: [PATCH 1/5] gh-91760: Deprecate group names and numbers which will be invalid in future Only sequence of ASCII digits not starting with 0 (except group 0) will be accepted as a numerical reference. The group name in bytes patterns and replacement strings could only contain ASCII letters and digits and underscore. --- Doc/library/re.rst | 11 +++ Doc/whatsnew/3.11.rst | 9 +++ Lib/re/_parser.py | 67 +++++++++++++------ Lib/test/test_re.py | 64 ++++++++++++++++++ ...2-04-21-19-46-03.gh-issue-91760.zDtv1E.rst | 4 ++ 5 files changed, 136 insertions(+), 19 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-04-21-19-46-03.gh-issue-91760.zDtv1E.rst diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 89de9286ace79c..70d96799386e83 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -417,6 +417,9 @@ The special characters are: | | * ``\1`` | +---------------------------------------+----------------------------------+ + .. deprecated:: 3.11 + Group names containing non-ASCII characters in bytes patterns. + .. index:: single: (?P=; in regular expressions ``(?P=name)`` @@ -486,6 +489,9 @@ The special characters are: will match with ``''`` as well as ``'user@host.com'``, but not with ``''``. + .. deprecated:: 3.11 + Group *id* containing anything except ASCII digits or starting with ``0``. + The special sequences consist of ``'\'`` and a character from the list below. If the ordinary character is not an ASCII digit or an ASCII letter, then the @@ -995,6 +1001,11 @@ form. Empty matches for the pattern are replaced when adjacent to a previous non-empty match. + .. deprecated:: 3.11 + Group *id* containing anything except ASCII digits or starting with ``0`` + (except group 0). + Group names containing non-ASCII characters in bytes replacement strings. + .. 
function:: subn(pattern, repl, string, count=0, flags=0) diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst index 6540a255a0ed82..5d4e2b4bf9dfac 100644 --- a/Doc/whatsnew/3.11.rst +++ b/Doc/whatsnew/3.11.rst @@ -922,6 +922,15 @@ Deprecated (Contributed by Brett Cannon in :issue:`47061`.) +* More strict rules will be applied now applied for numerical group references + and group names in regular expressions in future Python versions. + Only sequence of ASCII digits not starting with ``0`` (except group 0) will be + now accepted as a numerical reference. + The group name in bytes patterns and replacement strings could only + contain ASCII letters and digits and underscore. + For now, a deprecation warning is raised for such syntax. + (Contributed by Serhiy Storchaka in :issue:`91760`.) + Removed ======= diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index f191f809a1491e..3608802e06a161 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -293,8 +293,22 @@ def seek(self, index): self.__next() def error(self, msg, offset=0): + if not self.istext: + msg = msg.encode('ascii', 'backslashreplace').decode('ascii') return error(msg, self.string, self.tell() - offset) + def checkgroupname(self, name, offset, nested): + if not name.isidentifier(): + msg = "bad character in group name %r" % name + raise self.error(msg, len(name) + offset) + if not (self.istext or name.isascii()): + import warnings + warnings.warn( + "bad character in group name %a at position %d" % + (name, self.tell() - len(name) - offset), + DeprecationWarning, stacklevel=nested + 7 + ) + def _class_escape(source, escape): # handle escape code inside character class code = ESCAPES.get(escape) @@ -707,15 +721,11 @@ def _parse(source, state, verbose, nested, first=False): if sourcematch("<"): # named group: skip forward to end of name name = source.getuntil(">", "group name") - if not name.isidentifier(): - msg = "bad character in group name %r" % name - raise source.error(msg, len(name) + 1) + 
source.checkgroupname(name, 1, nested) elif sourcematch("="): # named backreference name = source.getuntil(")", "group name") - if not name.isidentifier(): - msg = "bad character in group name %r" % name - raise source.error(msg, len(name) + 1) + source.checkgroupname(name, 1, nested) gid = state.groupdict.get(name) if gid is None: msg = "unknown group name %r" % name @@ -776,12 +786,7 @@ def _parse(source, state, verbose, nested, first=False): elif char == "(": # conditional backreference group condname = source.getuntil(")", "group name") - if condname.isidentifier(): - condgroup = state.groupdict.get(condname) - if condgroup is None: - msg = "unknown group name %r" % condname - raise source.error(msg, len(condname) + 1) - else: + if not condname.isidentifier(): try: condgroup = int(condname) if condgroup < 0: @@ -795,6 +800,21 @@ def _parse(source, state, verbose, nested, first=False): if condgroup >= MAXGROUPS: msg = "invalid group reference %d" % condgroup raise source.error(msg, len(condname) + 1) + if not (condname.isdecimal() and condname.isascii() and + (condname[0] != "0" or condname == "0")): + import warnings + warnings.warn( + "bad character in group name %s at position %d" % + (repr(condname) if source.istext else ascii(condname), + source.tell() - len(condname) - 1), + DeprecationWarning, stacklevel=nested + 6 + ) + else: + source.checkgroupname(condname, 1, nested) + condgroup = state.groupdict.get(condname) + if condgroup is None: + msg = "unknown group name %r" % condname + raise source.error(msg, len(condname) + 1) state.checklookbehindgroup(condgroup, source) item_yes = _parse(source, state, verbose, nested + 1) if source.match("|"): @@ -1006,16 +1026,10 @@ def addgroup(index, pos): # group c = this[1] if c == "g": - name = "" if not s.match("<"): raise s.error("missing <") name = s.getuntil(">", "group name") - if name.isidentifier(): - try: - index = groupindex[name] - except KeyError: - raise IndexError("unknown group name %r" % name) from 
None - else: + if not name.isidentifier(): try: index = int(name) if index < 0: @@ -1026,6 +1040,21 @@ def addgroup(index, pos): if index >= MAXGROUPS: raise s.error("invalid group reference %d" % index, len(name) + 1) + if not (name.isdecimal() and name.isascii() and + (name[0] != "0" or name == "0")): + import warnings + warnings.warn( + "bad character in group name %s at position %d" % + (repr(name) if s.istext else ascii(name), + s.tell() - len(name) - 1), + DeprecationWarning, stacklevel=5 + ) + else: + s.checkgroupname(name, 1, -1) + try: + index = groupindex[name] + except KeyError: + raise IndexError("unknown group name %r" % name) from None addgroup(index, len(name) + 1) elif c == "0": if s.next in OCTDIGITS: diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 781bfd6ea2edac..fc6fb0076d2fcd 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -135,6 +135,7 @@ def test_basic_re_sub(self): self.assertEqual(re.sub('(?Px)', r'\g\g<1>', 'xx'), 'xxxx') self.assertEqual(re.sub('(?Px)', r'\g\g', 'xx'), 'xxxx') self.assertEqual(re.sub('(?Px)', r'\g<1>\g<1>', 'xx'), 'xxxx') + self.assertEqual(re.sub('()x', r'\g<0>\g<0>', 'xx'), 'xxxx') self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') @@ -274,6 +275,21 @@ def test_symbolic_groups_errors(self): self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4) self.checkPatternError('(?P=©)', "bad character in group name '©'", 4) self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3) + with self.assertWarnsRegex(DeprecationWarning, + r"bad character in group name '\\xc2\\xb5' " + r"at position 4") as w: + re.compile(b'(?P<\xc2\xb5>x)') + self.assertEqual(w.warnings[0].filename, __file__) + with self.assertWarnsRegex(DeprecationWarning, + r"bad character in group name '\\xc2\\xb5' " + r"at position 4"): + self.checkPatternError(b'(?P=\xc2\xb5)', + r"unknown group name '\xc2\xb5'", 4) + 
with self.assertWarnsRegex(DeprecationWarning, + r"bad character in group name '\\xc2\\xb5' " + r"at position 3"): + self.checkPatternError(b'(?(\xc2\xb5)y)', + r"unknown group name '\xc2\xb5'", 3) def test_symbolic_refs(self): self.assertEqual(re.sub('(?Px)|(?Py)', r'\g', 'xx'), '') @@ -306,12 +322,39 @@ def test_symbolic_refs_errors(self): re.sub('(?Px)', r'\g', 'xx') self.checkTemplateError('(?Px)', r'\g<-1>', 'xx', "bad character in group name '-1'", 3) + with self.assertWarnsRegex(DeprecationWarning, + r"bad character in group name '\+1' " + r"at position 3") as w: + re.sub('(?Px)', r'\g<+1>', 'xx') + self.assertEqual(w.warnings[0].filename, __file__) + with self.assertWarnsRegex(DeprecationWarning, + r"bad character in group name '01' " + r"at position 3"): + re.sub('(?Px)', r'\g<01>', 'xx') + with self.assertWarnsRegex(DeprecationWarning, + r"bad character in group name '1_0' " + r"at position 3"): + re.sub('()'*10, r'\g<1_0>', 'xx') + with self.assertWarnsRegex(DeprecationWarning, + r"bad character in group name ' 1 ' " + r"at position 3"): + re.sub('(?Px)', r'\g< 1 >', 'xx') self.checkTemplateError('(?Px)', r'\g<©>', 'xx', "bad character in group name '©'", 3) + with self.assertWarnsRegex(DeprecationWarning, + r"bad character in group name '\\xc2\\xb5' " + r"at position 3") as w: + with self.assertRaisesRegex(IndexError, "unknown group name '\xc2\xb5'"): + re.sub(b'(?Px)', b'\\g<\xc2\xb5>', b'xx') + self.assertEqual(w.warnings[0].filename, __file__) self.checkTemplateError('(?Px)', r'\g<㊀>', 'xx', "bad character in group name '㊀'", 3) self.checkTemplateError('(?Px)', r'\g<¹>', 'xx', "bad character in group name '¹'", 3) + with self.assertWarnsRegex(DeprecationWarning, + r"bad character in group name '१' " + r"at position 3"): + re.sub('(?Px)', r'\g<१>', 'xx') def test_re_subn(self): self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) @@ -577,10 +620,31 @@ def test_re_groupref_exists_errors(self): self.checkPatternError(r'(?P)(?(0)a|b)', 'bad 
group number', 10) self.checkPatternError(r'()(?(-1)a|b)', "bad character in group name '-1'", 5) + with self.assertWarnsRegex(DeprecationWarning, + r"bad character in group name '\+1' " + r"at position 5") as w: + re.compile(r'()(?(+1)a|b)') + self.assertEqual(w.warnings[0].filename, __file__) + with self.assertWarnsRegex(DeprecationWarning, + r"bad character in group name '01' " + r"at position 5"): + re.compile(r'()(?(01)a|b)') + with self.assertWarnsRegex(DeprecationWarning, + r"bad character in group name '1_0' " + r"at position 23"): + re.compile(r'()'*10 + r'(?(1_0)a|b)') + with self.assertWarnsRegex(DeprecationWarning, + r"bad character in group name ' 1 ' " + r"at position 5"): + re.compile(r'()(?( 1 )a|b)') self.checkPatternError(r'()(?(㊀)a|b)', "bad character in group name '㊀'", 5) self.checkPatternError(r'()(?(¹)a|b)', "bad character in group name '¹'", 5) + with self.assertWarnsRegex(DeprecationWarning, + r"bad character in group name '१' " + r"at position 5"): + re.compile(r'()(?(१)a|b)') self.checkPatternError(r'()(?(1', "missing ), unterminated name", 5) self.checkPatternError(r'()(?(1)a', diff --git a/Misc/NEWS.d/next/Library/2022-04-21-19-46-03.gh-issue-91760.zDtv1E.rst b/Misc/NEWS.d/next/Library/2022-04-21-19-46-03.gh-issue-91760.zDtv1E.rst new file mode 100644 index 00000000000000..0bddbbe093144c --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-04-21-19-46-03.gh-issue-91760.zDtv1E.rst @@ -0,0 +1,4 @@ +More strict rules will be applied for numerical group references and group +names in regular expressions. For now, a deprecation warning is emitted for +group references and group names which will be errors in future Python +versions. From bb7272b11d17243e7d3f872976e9bb63a933f0b1 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 23 Apr 2022 19:19:05 +0300 Subject: [PATCH 2/5] Minimize diff. 
--- Lib/re/_parser.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index 3b76dba3ee4a48..1a53584845d6ef 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -786,7 +786,13 @@ def _parse(source, state, verbose, nested, first=False): elif char == "(": # conditional backreference group condname = source.getuntil(")", "group name") - if not condname.isidentifier(): + if condname.isidentifier(): + source.checkgroupname(condname, 1, nested) + condgroup = state.groupdict.get(condname) + if condgroup is None: + msg = "unknown group name %r" % condname + raise source.error(msg, len(condname) + 1) + else: try: condgroup = int(condname) if condgroup < 0: @@ -813,12 +819,6 @@ def _parse(source, state, verbose, nested, first=False): source.tell() - len(condname) - 1), DeprecationWarning, stacklevel=nested + 6 ) - else: - source.checkgroupname(condname, 1, nested) - condgroup = state.groupdict.get(condname) - if condgroup is None: - msg = "unknown group name %r" % condname - raise source.error(msg, len(condname) + 1) state.checklookbehindgroup(condgroup, source) item_yes = _parse(source, state, verbose, nested + 1) if source.match("|"): @@ -1027,7 +1027,13 @@ def addgroup(index, pos): if not s.match("<"): raise s.error("missing <") name = s.getuntil(">", "group name") - if not name.isidentifier(): + if name.isidentifier(): + s.checkgroupname(name, 1, -1) + try: + index = groupindex[name] + except KeyError: + raise IndexError("unknown group name %r" % name) from None + else: try: index = int(name) if index < 0: @@ -1047,12 +1053,6 @@ def addgroup(index, pos): s.tell() - len(name) - 1), DeprecationWarning, stacklevel=5 ) - else: - s.checkgroupname(name, 1, -1) - try: - index = groupindex[name] - except KeyError: - raise IndexError("unknown group name %r" % name) from None addgroup(index, len(name) + 1) elif c == "0": if s.next in OCTDIGITS: From 9965ab2ff88052413cbe23da313e5fa724568906 
Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 23 Apr 2022 19:40:39 +0300 Subject: [PATCH 3/5] Simplify tests. --- Lib/test/test_re.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 4bbc99a66e7b03..8e0256406be141 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -279,7 +279,7 @@ def test_symbolic_groups_errors(self): r"bad character in group name '\\xc2\\xb5' " r"at position 4") as w: re.compile(b'(?P<\xc2\xb5>x)') - self.assertEqual(w.warnings[0].filename, __file__) + self.assertEqual(w.filename, __file__) with self.assertWarnsRegex(DeprecationWarning, r"bad character in group name '\\xc2\\xb5' " r"at position 4"): @@ -326,7 +326,7 @@ def test_symbolic_refs_errors(self): r"bad character in group name '\+1' " r"at position 3") as w: re.sub('(?Px)', r'\g<+1>', 'xx') - self.assertEqual(w.warnings[0].filename, __file__) + self.assertEqual(w.filename, __file__) with self.assertWarnsRegex(DeprecationWarning, r"bad character in group name '01' " r"at position 3"): @@ -346,7 +346,7 @@ def test_symbolic_refs_errors(self): r"at position 3") as w: with self.assertRaisesRegex(IndexError, "unknown group name '\xc2\xb5'"): re.sub(b'(?Px)', b'\\g<\xc2\xb5>', b'xx') - self.assertEqual(w.warnings[0].filename, __file__) + self.assertEqual(w.filename, __file__) self.checkTemplateError('(?Px)', r'\g<㊀>', 'xx', "bad character in group name '㊀'", 3) self.checkTemplateError('(?Px)', r'\g<¹>', 'xx', @@ -624,7 +624,7 @@ def test_re_groupref_exists_errors(self): r"bad character in group name '\+1' " r"at position 5") as w: re.compile(r'()(?(+1)a|b)') - self.assertEqual(w.warnings[0].filename, __file__) + self.assertEqual(w.filename, __file__) with self.assertWarnsRegex(DeprecationWarning, r"bad character in group name '01' " r"at position 5"): From 543d9957c6b79354aa5afaa7dd2db35d05ff7e3f Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 24 Apr 2022 10:31:00 +0300 Subject: [PATCH 
4/5] Remove restriction for group numerical reference starting with 0. --- Doc/library/re.rst | 5 ++--- Doc/whatsnew/3.11.rst | 3 +-- Lib/re/_parser.py | 6 ++---- Lib/test/test_re.py | 8 -------- 4 files changed, 5 insertions(+), 17 deletions(-) diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 70d96799386e83..3cd9f252fee6f3 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -490,7 +490,7 @@ The special characters are: not with ``''``. .. deprecated:: 3.11 - Group *id* containing anything except ASCII digits or starting with ``0``. + Group *id* containing anything except ASCII digits. The special sequences consist of ``'\'`` and a character from the list below. @@ -1002,8 +1002,7 @@ form. non-empty match. .. deprecated:: 3.11 - Group *id* containing anything except ASCII digits or starting with ``0`` - (except group 0). + Group *id* containing anything except ASCII digits. Group names containing non-ASCII characters in bytes replacement strings. diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst index 4f7d8d353bd161..e55efd95c8b60b 100644 --- a/Doc/whatsnew/3.11.rst +++ b/Doc/whatsnew/3.11.rst @@ -942,8 +942,7 @@ Deprecated * More strict rules will be applied now applied for numerical group references and group names in regular expressions in future Python versions. - Only sequence of ASCII digits not starting with ``0`` (except group 0) will be - now accepted as a numerical reference. + Only sequence of ASCII digits will now be accepted as a numerical reference. The group name in bytes patterns and replacement strings could only contain ASCII letters and digits and underscore. For now, a deprecation warning is raised for such syntax. 
diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index 1a53584845d6ef..ad0c176a7cfed4 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -810,8 +810,7 @@ def _parse(source, state, verbose, nested, first=False): state.grouprefpos[condgroup] = ( source.tell() - len(condname) - 1 ) - if not (condname.isdecimal() and condname.isascii() and - (condname[0] != "0" or condname == "0")): + if not (condname.isdecimal() and condname.isascii()): import warnings warnings.warn( "bad character in group name %s at position %d" % @@ -1044,8 +1043,7 @@ def addgroup(index, pos): if index >= MAXGROUPS: raise s.error("invalid group reference %d" % index, len(name) + 1) - if not (name.isdecimal() and name.isascii() and - (name[0] != "0" or name == "0")): + if not (name.isdecimal() and name.isascii()): import warnings warnings.warn( "bad character in group name %s at position %d" % diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 051984b067a8d2..c1014753802c92 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -327,10 +327,6 @@ def test_symbolic_refs_errors(self): r"at position 3") as w: re.sub('(?Px)', r'\g<+1>', 'xx') self.assertEqual(w.filename, __file__) - with self.assertWarnsRegex(DeprecationWarning, - r"bad character in group name '01' " - r"at position 3"): - re.sub('(?Px)', r'\g<01>', 'xx') with self.assertWarnsRegex(DeprecationWarning, r"bad character in group name '1_0' " r"at position 3"): @@ -625,10 +621,6 @@ def test_re_groupref_exists_errors(self): r"at position 5") as w: re.compile(r'()(?(+1)a|b)') self.assertEqual(w.filename, __file__) - with self.assertWarnsRegex(DeprecationWarning, - r"bad character in group name '01' " - r"at position 5"): - re.compile(r'()(?(01)a|b)') with self.assertWarnsRegex(DeprecationWarning, r"bad character in group name '1_0' " r"at position 23"): From 7b957312371b1276f6f905dc7e78b96e7dda575a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 25 Apr 2022 13:31:47 +0300 Subject: [PATCH 5/5] Update 
Doc/whatsnew/3.11.rst --- Doc/whatsnew/3.11.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst index e55efd95c8b60b..3de53f19c925e0 100644 --- a/Doc/whatsnew/3.11.rst +++ b/Doc/whatsnew/3.11.rst @@ -946,7 +946,7 @@ Deprecated The group name in bytes patterns and replacement strings could only contain ASCII letters and digits and underscore. For now, a deprecation warning is raised for such syntax. - (Contributed by Serhiy Storchaka in :issue:`91760`.) + (Contributed by Serhiy Storchaka in :gh:`91760`.) Removed