Skip to content

Commit a4421f5

Browse files
Implement unicodedata.block()
1 parent 175ab31 commit a4421f5

File tree

8 files changed

+922
-2
lines changed

8 files changed

+922
-2
lines changed

Doc/library/unicodedata.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,18 @@ following functions:
130130
`Unicode Standard Annex #11 <https://www.unicode.org/reports/tr11/>`_.
131131

132132

133+
.. function:: block(chr, /)
134+
135+
Returns the `block
136+
<https://www.unicode.org/versions/17.0.0/core-spec/chapter-3/#G64189>`_
137+
assigned to the character *chr*. For example::
138+
139+
>>> unicodedata.block('S')
140+
'Basic Latin'
141+
142+
.. versionadded:: next
143+
144+
133145
.. function:: mirrored(chr, /)
134146

135147
Returns the mirrored property assigned to the character *chr* as

Doc/whatsnew/3.15.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1134,6 +1134,11 @@ unicodedata
11341134
of the character which are related to the above algorithm.
11351135
(Contributed by Serhiy Storchaka and Guillaume Sanchez in :gh:`74902`.)
11361136

1137+
* Add :func:`~unicodedata.block` function to return the `Unicode block
1138+
<https://www.unicode.org/versions/17.0.0/core-spec/chapter-3/#G64189>`_
1139+
assigned to a character.
1140+
(Contributed by Stan Ulbrych in :gh:`66802`.)
1141+
11371142

11381143
unittest
11391144
--------

Lib/test/test_unicodedata.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -973,6 +973,88 @@ def graphemes(*args):
973973
'a\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'),
974974
['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'])
975975

976+
def test_block(self):
977+
self.assertEqual(self.db.block('\u1159'), 'Hangul Jamo')
978+
self.assertEqual(self.db.block('\u11F9'), 'Hangul Jamo')
979+
self.assertEqual(self.db.block('\uD788'), 'Hangul Syllables')
980+
self.assertEqual(self.db.block('\uD7A3'), 'Hangul Syllables')
981+
# New in 5.0.0
982+
self.assertEqual(self.db.block('\u05BA'), 'Hebrew')
983+
self.assertEqual(self.db.block('\u20EF'), 'Combining Diacritical Marks for Symbols')
984+
# New in 5.1.0
985+
self.assertEqual(self.db.block('\u2064'), 'General Punctuation')
986+
self.assertEqual(self.db.block('\uAA4D'), 'Cham')
987+
# New in 5.2.0
988+
self.assertEqual(self.db.block('\u0816'), 'Samaritan')
989+
self.assertEqual(self.db.block('\uA97C'), 'Hangul Jamo Extended-A')
990+
self.assertEqual(self.db.block('\uD7C6'), 'Hangul Jamo Extended-B')
991+
self.assertEqual(self.db.block('\uD7FB'), 'Hangul Jamo Extended-B')
992+
# New in 6.0.0
993+
self.assertEqual(self.db.block('\u093A'), 'Devanagari')
994+
self.assertEqual(self.db.block('\U00011002'), 'Brahmi')
995+
# New in 6.1.0
996+
self.assertEqual(self.db.block('\U000E0FFF'), 'No_Block')
997+
self.assertEqual(self.db.block('\U00016F7E'), 'Miao')
998+
# New in 6.2.0
999+
self.assertEqual(self.db.block('\U0001F1E6'), 'Enclosed Alphanumeric Supplement')
1000+
self.assertEqual(self.db.block('\U0001F1FF'), 'Enclosed Alphanumeric Supplement')
1001+
# New in 6.3.0
1002+
self.assertEqual(self.db.block('\u180E'), 'Mongolian')
1003+
self.assertEqual(self.db.block('\u1A1B'), 'Buginese')
1004+
# New in 7.0.0
1005+
self.assertEqual(self.db.block('\u0E33'), 'Thai')
1006+
self.assertEqual(self.db.block('\u0EB3'), 'Lao')
1007+
self.assertEqual(self.db.block('\U0001BCA3'), 'Shorthand Format Controls')
1008+
self.assertEqual(self.db.block('\U0001E8D6'), 'Mende Kikakui')
1009+
self.assertEqual(self.db.block('\U0001163E'), 'Modi')
1010+
# New in 8.0.0
1011+
self.assertEqual(self.db.block('\u08E3'), 'Arabic Extended-A')
1012+
self.assertEqual(self.db.block('\U00011726'), 'Ahom')
1013+
# New in 9.0.0
1014+
self.assertEqual(self.db.block('\u0600'), 'Arabic')
1015+
self.assertEqual(self.db.block('\U000E007F'), 'Tags')
1016+
self.assertEqual(self.db.block('\U00011CB4'), 'Marchen')
1017+
self.assertEqual(self.db.block('\u200D'), 'General Punctuation')
1018+
# New in 10.0.0
1019+
self.assertEqual(self.db.block('\U00011D46'), 'Masaram Gondi')
1020+
self.assertEqual(self.db.block('\U00011D47'), 'Masaram Gondi')
1021+
self.assertEqual(self.db.block('\U00011A97'), 'Soyombo')
1022+
# New in 11.0.0
1023+
self.assertEqual(self.db.block('\U000110CD'), 'Kaithi')
1024+
self.assertEqual(self.db.block('\u07FD'), 'NKo')
1025+
self.assertEqual(self.db.block('\U00011EF6'), 'Makasar')
1026+
# New in 12.0.0
1027+
self.assertEqual(self.db.block('\U00011A84'), 'Soyombo')
1028+
self.assertEqual(self.db.block('\U00013438'), 'Egyptian Hieroglyph Format Controls')
1029+
self.assertEqual(self.db.block('\U0001E2EF'), 'Wancho')
1030+
self.assertEqual(self.db.block('\U00016F87'), 'Miao')
1031+
# New in 13.0.0
1032+
self.assertEqual(self.db.block('\U00011941'), 'Dives Akuru')
1033+
self.assertEqual(self.db.block('\U00016FE4'), 'Ideographic Symbols and Punctuation')
1034+
self.assertEqual(self.db.block('\U00011942'), 'Dives Akuru')
1035+
# New in 14.0.0
1036+
self.assertEqual(self.db.block('\u0891'), 'Arabic Extended-B')
1037+
self.assertEqual(self.db.block('\U0001E2AE'), 'Toto')
1038+
# New in 15.0.0
1039+
self.assertEqual(self.db.block('\U00011F02'), 'Kawi')
1040+
self.assertEqual(self.db.block('\U0001343F'), 'Egyptian Hieroglyph Format Controls')
1041+
self.assertEqual(self.db.block('\U0001E4EF'), 'Nag Mundari')
1042+
self.assertEqual(self.db.block('\U00011F3F'), 'Kawi')
1043+
# New in 16.0.0
1044+
self.assertEqual(self.db.block('\U000113D1'), 'Tulu-Tigalari')
1045+
self.assertEqual(self.db.block('\U0001E5EF'), 'Ol Onal')
1046+
self.assertEqual(self.db.block('\U0001612C'), 'Gurung Khema')
1047+
self.assertEqual(self.db.block('\U00016D63'), 'Kirat Rai')
1048+
# New in 17.0.0
1049+
self.assertEqual(self.db.block('\u1AEB'), 'Combining Diacritical Marks Extended')
1050+
self.assertEqual(self.db.block('\U00011B67'), 'Sharada Supplement')
1051+
1052+
self.assertRaises(TypeError, self.db.block)
1053+
self.assertRaises(TypeError, self.db.block, b'x')
1054+
self.assertRaises(TypeError, self.db.block, 120)
1055+
self.assertRaises(TypeError, self.db.block, '')
1056+
self.assertRaises(TypeError, self.db.block, 'xx')
1057+
9761058

9771059
class Unicode_3_2_0_FunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
9781060
db = unicodedata.ucd_3_2_0
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Add :func:`unicodedata.block` function to return the `Unicode block
2+
<https://www.unicode.org/versions/17.0.0/core-spec/chapter-3/#G64189>`_ of a
3+
character.

Modules/clinic/unicodedata.c.h

Lines changed: 37 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Modules/unicodedata.c

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,27 @@ _getrecord_ex(Py_UCS4 code)
9696
return &_PyUnicode_Database_Records[index];
9797
}
9898

99+
static const char *
100+
_getrecord_block(Py_UCS4 code)
101+
{
102+
int l = 0, h = BLOCK_COUNT - 1;
103+
while (l <= h) {
104+
int m = (l + h) / 2;
105+
if (code < _PyUnicode_Blocks[m].s) {
106+
h = m - 1;
107+
}
108+
else if (code > _PyUnicode_Blocks[m].e) {
109+
l = m + 1;
110+
}
111+
else {
112+
return _PyUnicode_BlockNames[_PyUnicode_Blocks[m].name];
113+
}
114+
}
115+
// Otherwise, return the default value per
116+
// https://www.unicode.org/versions/latest/core-spec/chapter-3/#G64189
117+
return "No_Block";
118+
}
119+
99120
typedef struct {
100121
PyObject *SegmentType;
101122
PyObject *GraphemeBreakIteratorType;
@@ -2066,6 +2087,23 @@ unicodedata_iter_graphemes_impl(PyObject *module, PyObject *unistr,
20662087
return (PyObject*)gbi;
20672088
}
20682089

2090+
/*[clinic input]
2091+
unicodedata.block
2092+
2093+
chr: int(accept={str})
2094+
/
2095+
2096+
Return block assigned to the character chr.
2097+
[clinic start generated code]*/
2098+
2099+
static PyObject *
2100+
unicodedata_block_impl(PyObject *module, int chr)
2101+
/*[clinic end generated code: output=5f8b40c49eaec75a input=0834cf2642d6eaae]*/
2102+
{
2103+
Py_UCS4 c = (Py_UCS4)chr;
2104+
return PyUnicode_FromString(_getrecord_block(c));
2105+
}
2106+
20692107
/*[clinic input]
20702108
unicodedata.grapheme_cluster_break
20712109
@@ -2128,6 +2166,7 @@ unicodedata_extended_pictographic_impl(PyObject *module, int chr)
21282166
// an UCD instance.
21292167
static PyMethodDef unicodedata_functions[] = {
21302168
// Module only functions.
2169+
UNICODEDATA_BLOCK_METHODDEF
21312170
UNICODEDATA_GRAPHEME_CLUSTER_BREAK_METHODDEF
21322171
UNICODEDATA_INDIC_CONJUNCT_BREAK_METHODDEF
21332172
UNICODEDATA_EXTENDED_PICTOGRAPHIC_METHODDEF
@@ -2137,7 +2176,7 @@ static PyMethodDef unicodedata_functions[] = {
21372176

21382177
// The following definitions are shared between the module
21392178
// and the UCD class.
2140-
#define DB_methods (unicodedata_functions + 6)
2179+
#define DB_methods (unicodedata_functions + 7)
21412180

21422181
UNICODEDATA_UCD_DECIMAL_METHODDEF
21432182
UNICODEDATA_UCD_DIGIT_METHODDEF

0 commit comments

Comments
 (0)