Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions Doc/library/re.rst
Original file line number Diff line number Diff line change
Expand Up @@ -508,9 +508,17 @@ The special characters are:
Will try to match with ``yes-pattern`` if the group with given *id* or
*name* exists, and with ``no-pattern`` if it doesn't. ``no-pattern`` is
optional and can be omitted. For example,
``(<)?(\w+@\w+(?:\.\w+)+)(?(1)>|$)`` is a poor email matching pattern, which
will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but
not with ``'<user@host.com'`` nor ``'user@host.com>'``.
``(<)?(\w+@\w+(?:\.\w+)+)(?(1)>|$)`` is a poor email matching pattern,
which will match with ``'<user@host.com>'`` as well as ``'user@host.com'``,
and will not match with ``'user@host.com>'``.

Note that if the group matched but ``yes-pattern`` then fails to match,
the engine backtracks and the optional group reverts to its unmatched
state. For example, searching ``'<3'`` with ``(<)?\w+(?(1)>)`` matches
only ``'3'`` at position 1, and ``group(1)`` is ``None``: the engine
first consumes the leading ``<`` as group 1, fails to match ``>`` at
position 2, and then retries without consuming ``<``.

.. versionchanged:: 3.12
Group *id* can only contain ASCII digits.
Expand Down
31 changes: 31 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -706,6 +706,37 @@ def test_re_groupref_exists_errors(self):
self.checkPatternError(r'()(?(2)a)',
"invalid group reference 2", 5)

def test_re_conditional_drops_capture_on_backtrack(self):
# Issue: a captured optional group is cleared when backtracking
# causes the ``yes-pattern`` of a (?(id/name)yes|no) construct
# to not match after the capture was set. See:
# https://github.com/python/cpython/issues/151819
# Minimal reproduction from the issue:
m = re.search(r'(<)?\w+(?(1)>)', '<3')
self.assertEqual(m.group(), '3')
self.assertEqual(m.span(), (1, 2))
self.assertEqual(m.group(1), None)

# The successful case keeps the capture intact:
m = re.search(r'(<)?\w+(?(1)>)', '<body>')
self.assertEqual(m.group(), '<body>')
self.assertEqual(m.span(), (0, 6))
self.assertEqual(m.group(1), '<')

# Same effect with ``\w`` style groups and a longer input:
m = re.search(r'(<)?[A-Za-z]+(?(1)>)', '<abcXYZ')
self.assertEqual(m.group(), 'abcXYZ')
self.assertEqual(m.span(), (1, 7))
self.assertEqual(m.group(1), None)

# The pattern documented in Re.rst: with yes-pattern failing
# the leading "<" is rerolled.
m = re.search(r'(<)?(\w+@\w+(?:\.\w+)+)(?(1)>|$)', '<user@host.com')
self.assertEqual(m.group(), 'user@host.com')
self.assertEqual(m.span(), (1, 14))
self.assertEqual(m.group(1), None)
self.assertEqual(m.group(2), 'user@host.com')

def test_re_groupref_exists_validation_bug(self):
for i in range(256):
with self.subTest(code=i):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Clarify the :ref:`re-syntax` documentation for
``(?(id/name)yes-pattern|no-pattern)``: add a paragraph documenting that
backtracking can clear an optional capture group when the ``yes-pattern``
that depends on it fails to match. Also correct the expected matches of
the embedded example pattern. Issue #151819.
Loading