Skip to content

Commit 388a3b6

Browse files
gh-95555: Add Regional_Indicator, Hex_Digit and ASCII_Hex_Digit properties
All three properties are complete, fixed sets of code points, so they are matched as fixed ranges: Regional_Indicator (the 26 regional indicator symbols A..Z, U+1F1E6..U+1F1FF), ASCII_Hex_Digit (the ASCII hex digits, = POSIX xdigit) and Hex_Digit (which adds the fullwidth forms). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 8ca0ebe commit 388a3b6

3 files changed

Lines changed: 30 additions & 6 deletions

File tree

Doc/library/re.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -682,7 +682,8 @@ character ``'$'``.
682682
``cntrl``, ``digit``, ``graph``, ``lower``, ``print``, ``space``,
683683
``upper``, ``word`` and ``xdigit``.
684684
* The properties ``ASCII``, ``Any``, ``Assigned``,
685-
``Noncharacter_Code_Point``, ``Join_Control``, ``Pattern_Syntax`` and
685+
``Noncharacter_Code_Point``, ``Join_Control``, ``Regional_Indicator``,
686+
``ASCII_Hex_Digit``, ``Hex_Digit``, ``Pattern_Syntax`` and
686687
``Pattern_White_Space``.
687688

688689
Where a supported property corresponds to a :mod:`unicodedata` accessor or

Lib/re/_properties.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,9 @@
2020
# Co, Zl and Zp as fixed ranges (see _GC_ANALYTIC).
2121
#
2222
# * Code-point classes given by fixed ranges (see _analytic_ranges): ASCII,
23-
# Any, Noncharacter_Code_Point, Join_Control, xdigit, cntrl, and the
24-
# immutable Pattern_Syntax and Pattern_White_Space.
23+
# Any, Noncharacter_Code_Point, Join_Control, Regional_Indicator, xdigit,
24+
# ASCII_Hex_Digit, Hex_Digit, cntrl, and the immutable Pattern_Syntax and
25+
# Pattern_White_Space.
2526
#
2627

2728
from ._constants import (
@@ -177,16 +178,27 @@ def _analytic_ranges():
177178
noncharacter = [(0xFDD0, 0xFDEF)]
178179
noncharacter += [(plane | 0xFFFE, plane | 0xFFFF)
179180
for plane in range(0, MAXUNICODE + 1, 0x10000)]
181+
# Regional_Indicator (RI): the 26 regional indicator symbols A..Z, a fixed
182+
# contiguous range U+1F1E6..U+1F1FF (PropList.txt binary property).
183+
regional_indicator = [(0x1F1E6, 0x1F1FF)]
184+
# ASCII_Hex_Digit (= POSIX xdigit) and Hex_Digit, which adds the fullwidth
185+
# forms. Both are complete, fixed sets (PropList.txt binary properties).
186+
ascii_hex = [(0x30, 0x39), (0x41, 0x46), (0x61, 0x66)]
187+
hex_digit = ascii_hex + [(0xFF10, 0xFF19), (0xFF21, 0xFF26), (0xFF41, 0xFF46)]
180188
return {
181189
"ascii": [(0, 0x7F)],
182190
"any": [(0, MAXUNICODE)],
183191
# Join_Control (U+200C ZWNJ, U+200D ZWJ; the Unicode Standard 23.2,
184192
# "Layout Controls"), a PropList.txt binary property.
185193
"joincontrol": [(0x200C, 0x200D)],
194+
"regionalindicator": regional_indicator,
195+
"ri": regional_indicator,
186196
"noncharactercodepoint": noncharacter,
187-
# ASCII hexadecimal digits; the Unicode Hex_Digit property is not
188-
# available from Python.
189-
"xdigit": [(0x30, 0x39), (0x41, 0x46), (0x61, 0x66)],
197+
"xdigit": ascii_hex, # POSIX, ASCII only
198+
"asciihexdigit": ascii_hex,
199+
"ahex": ascii_hex,
200+
"hexdigit": hex_digit,
201+
"hex": hex_digit,
190202
# POSIX cntrl is the General_Category Cc, a fixed set of code points.
191203
"cntrl": _CC_RANGES,
192204
"patternwhitespace": _PATTERN_WHITE_SPACE_RANGES,

Lib/test/test_re.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1049,6 +1049,17 @@ def test_property_escapes(self):
10491049
self.assertTrue(re.fullmatch(r'\p{Noncharacter_Code_Point}+',
10501050
'\uFDD0\uFFFE\U0010FFFF'))
10511051
self.assertTrue(re.fullmatch(r'\p{Join_Control}+', '\u200C\u200D'))
1052+
self.assertTrue(re.fullmatch(r'\p{Regional_Indicator}+',
1053+
'\U0001F1E6\U0001F1FF'))
1054+
self.assertTrue(re.fullmatch(r'\p{RI}', '\U0001F1FA')) # symbol U
1055+
self.assertIsNone(re.fullmatch(r'\p{RI}', 'U'))
1056+
# Hex_Digit (ASCII hex plus fullwidth) and ASCII_Hex_Digit (= xdigit).
1057+
self.assertTrue(re.fullmatch(r'\p{Hex_Digit}+', '0123456789abcdefABCDEF'))
1058+
self.assertTrue(re.fullmatch(r'\p{Hex}+', '\uFF10\uFF21\uFF46'))  # fullwidth 0, A, f
1059+
self.assertTrue(re.fullmatch(r'\p{ASCII_Hex_Digit}+', '0aF'))
1060+
self.assertTrue(re.fullmatch(r'\p{AHex}+', '0aF'))
1061+
self.assertIsNone(re.fullmatch(r'\p{ASCII_Hex_Digit}', '\uFF10'))  # fullwidth 0 is not ASCII
1062+
self.assertIsNone(re.fullmatch(r'\p{Hex_Digit}', 'g'))
10521063

10531064
# Errors.
10541065
self.checkPatternError(r'\p', 'missing {, expected property name', 2)

0 commit comments

Comments
 (0)