diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 4745c1b98a45543..56d952d88400414 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -508,9 +508,17 @@ The special characters are: Will try to match with ``yes-pattern`` if the group with given *id* or *name* exists, and with ``no-pattern`` if it doesn't. ``no-pattern`` is optional and can be omitted. For example, - ``(<)?(\w+@\w+(?:\.\w+)+)(?(1)>|$)`` is a poor email matching pattern, which - will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but - not with ``'<user@host.com'`` nor ``'user@host.com>'``. + ``(<)?(\w+@\w+(?:\.\w+)+)(?(1)>|$)`` is a poor email matching pattern, + which will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, + and will not match with ``'user@host.com>'``. + + Note that when ``yes-pattern`` is not matched while the captured group + was set, backtracking clears the capture (the optional group falls + back to its no-match state). For example, + ``(<)?\w+(?(1)>)`` applied to ``'<3'`` matches only ``'3'`` at + position 1 with ``group(1) is None``: the engine first consumes the + leading ``<`` to satisfy group 1, fails to match ``>`` at position + 2, then retries without consuming ``<``. .. versionchanged:: 3.12 Group *id* can only contain ASCII digits. diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 69d730c49387bee..553db8efb43d272 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -706,6 +706,37 @@ def test_re_groupref_exists_errors(self): self.checkPatternError(r'()(?(2)a)', "invalid group reference 2", 5) + def test_re_conditional_drops_capture_on_backtrack(self): + # Issue: a captured optional group is cleared when backtracking + # causes the ``yes-pattern`` of a (?(id/name)yes|no) construct + # to not match after the capture was set. 
See: + # https://github.com/python/cpython/issues/151819 + # Minimal reproduction from the issue: + m = re.search(r'(<)?\w+(?(1)>)', '<3') + self.assertEqual(m.group(), '3') + self.assertEqual(m.span(), (1, 2)) + self.assertEqual(m.group(1), None) + + # The successful case keeps the capture intact: + m = re.search(r'(<)?\w+(?(1)>)', '') + self.assertEqual(m.group(), '') + self.assertEqual(m.span(), (0, 6)) + self.assertEqual(m.group(1), '<') + + # Same effect with ``\w`` style groups and a longer input: + m = re.search(r'(<)?[A-Za-z]+(?(1)>)', '|$)', '