@@ -198,11 +198,11 @@ def makeunicodedata(unicode, trace):
198198 eastasianwidth = EASTASIANWIDTH_NAMES .index (unicode .widths [char ] or 'N' )
199199 graphemebreak = GRAPHEME_CLUSTER_NAMES .index (unicode .grapheme_breaks [char ] or 'Other' )
200200 extpict = unicode .ext_picts [char ]
201+ bidirectional = BIDIRECTIONAL_NAMES .index (unicode .bidi_classes [char ])
201202 if record :
202203 # extract database properties
203204 category = CATEGORY_NAMES .index (record .general_category )
204205 combining = int (record .canonical_combining_class )
205- bidirectional = BIDIRECTIONAL_NAMES .index (unicode .bidi_classes [char ])
206206 mirrored = record .bidi_mirrored == "Y"
207207 normalizationquickcheck = record .quick_check
208208 incb = INDIC_CONJUNCT_BREAK_NAMES .index (record .incb )
@@ -211,7 +211,6 @@ def makeunicodedata(unicode, trace):
211211 normalizationquickcheck , graphemebreak , incb , extpict ,
212212 )
213213 else :
214- bidirectional = BIDIRECTIONAL_NAMES .index (unicode .bidi_classes [char ])
215214 if eastasianwidth or graphemebreak or extpict or bidirectional :
216215 item = (0 , 0 , bidirectional , 0 , eastasianwidth ,
217216 0 , graphemebreak , 0 , extpict )
@@ -815,48 +814,51 @@ def merge_old_version(version, new, old):
815814 elif k == 2 :
816815 category_changes [i ] = CATEGORY_NAMES .index (value )
817816 elif k == 4 :
817+ # bidi_class changes handled via bidi_classes
818+ pass
819+ elif k == 5 :
818820 # We assume that all normalization changes are in 1:1 mappings
819821 assert " " not in value
820822 normalization_changes .append ((i , value ))
821- elif k == 5 :
823+ elif k == 6 :
822824 # we only support changes where the old value is a single digit
823825 assert value in "0123456789"
824826 decimal_changes [i ] = int (value )
825- elif k == 7 :
827+ elif k == 8 :
826828 # Since 0 encodes "no change", the old value is better not 0
827829 if not value :
828830 numeric_changes [i ] = - 1
829831 else :
830832 numeric_changes [i ] = float (value )
831833 assert numeric_changes [i ] not in (0 , - 1 )
832- elif k == 8 :
834+ elif k == 9 :
833835 if value == 'Y' :
834836 mirrored_changes [i ] = '1'
835837 else :
836838 mirrored_changes [i ] = '0'
837- elif k == 10 :
839+ elif k == 11 :
838840 # change to ISO comment, ignore
839841 pass
840- elif k == 11 :
842+ elif k == 12 :
841843 # change to simple uppercase mapping; ignore
842844 pass
843- elif k == 12 :
845+ elif k == 13 :
844846 # change to simple lowercase mapping; ignore
845847 pass
846- elif k == 13 :
848+ elif k == 14 :
847849 # change to simple titlecase mapping; ignore
848850 pass
849- elif k == 14 :
851+ elif k == 15 :
850852 # change to east asian width
851853 east_asian_width_changes [i ] = EASTASIANWIDTH_NAMES .index (value )
852- elif k == 15 :
854+ elif k == 16 :
853855 # derived property changes; not yet
854856 pass
855- elif k == 16 :
857+ elif k == 17 :
856858 # normalization quickchecks are not performed
857859 # for older versions
858860 pass
859- elif k == 17 :
861+ elif k == 18 :
860862 # The Indic_Conjunct_Break property did not exist for
861863 # older versions
862864 pass
@@ -943,7 +945,7 @@ class UcdRecord:
943945 name : str
944946 general_category : str
945947 canonical_combining_class : str
946- # UnicodeData.bidi_classes
948+ bidi_class : str
947949 decomposition_type : str
948950 decomposition_mapping : str
949951 numeric_type : str
@@ -975,7 +977,7 @@ class UcdRecord:
975977
976978
977979def from_row (row : List [str ]) -> UcdRecord :
978- return UcdRecord (* row [: 4 ], * row [ 5 :] , None , set (), 0 , "None" )
980+ return UcdRecord (* row , None , set (), 0 , "None" )
979981
980982
981983# --------------------------------------------------------------------
@@ -990,17 +992,14 @@ class UnicodeData:
990992 def __init__ (self , version , ideograph_check = True ):
991993 self .changed = []
992994 table = [None ] * 0x110000
993- bidi_classes = [None ] * 0x110000
994995 for s in UcdFile (UNICODE_DATA , version ):
995996 char = int (s [0 ], 16 )
996- bidi_classes [char ] = s [4 ]
997997 table [char ] = from_row (s )
998998
999999 self .derived_name_ranges = []
10001000
10011001 # expand first-last ranges
10021002 field = None
1003- bidi_val = None
10041003 for i in range (0 , 0x110000 ):
10051004 # The file UnicodeData.txt has its own distinct way of
10061005 # expressing ranges. See:
@@ -1009,8 +1008,7 @@ def __init__(self, version, ideograph_check=True):
10091008 if s :
10101009 if s .name [- 6 :] == "First>" :
10111010 s .name = ""
1012- field = dataclasses .astuple (s )[:14 ]
1013- bidi_val = bidi_classes [i ]
1011+ field = dataclasses .astuple (s )[:15 ]
10141012 elif s .name [- 5 :] == "Last>" :
10151013 for j , (rangename , _ ) in enumerate (derived_name_range_names ):
10161014 if s .name .startswith ("<" + rangename ):
@@ -1020,8 +1018,7 @@ def __init__(self, version, ideograph_check=True):
10201018 s .name = ""
10211019 field = None
10221020 elif field :
1023- bidi_classes [i ] = bidi_val
1024- table [i ] = UcdRecord ('%X' % i , * field [1 :], None , set (), 0 , "None" )
1021+ table [i ] = from_row (('%X' % i ,) + field [1 :])
10251022
10261023 # public attributes
10271024 self .filename = UNICODE_DATA % ''
@@ -1079,18 +1076,21 @@ def __init__(self, version, ideograph_check=True):
10791076
10801077 # Read DerivedBidiClass.txt for bidi classes
10811078 # see https://www.unicode.org/reports/tr44/#Missing_Conventions
1079+ bidi_classes = [None ] * 0x110000
1080+ for i in range (0 , 0x110000 ):
1081+ if table [i ] is not None :
1082+ bidi_classes [i ] = table [i ].bidi_class
10821083 if version != '3.2.0' :
10831084 missing_re = re .compile (
1084- r'# @missing: ([\dA-F]+) \.\.( [\dA-F]+); (\w+)'
1085+ r'# @missing: ([\dA-F]+\.\.[\dA-F]+); (\w+)'
10851086 )
10861087 with open_data (DERIVED_BIDI_CLASS , version ) as f :
10871088 for l in f :
10881089 m = missing_re .match (l )
10891090 if not m :
10901091 continue
1091- start , end = int (m [1 ], 16 ), int (m [2 ], 16 )
1092- name = BIDI_LONG_NAMES [m [3 ]]
1093- for i in range (start , end + 1 ):
1092+ name = BIDI_LONG_NAMES [m [2 ]]
1093+ for i in expand_range (m [1 ]):
10941094 bidi_classes [i ] = name
10951095 for char , (bidi ,) in UcdFile (DERIVED_BIDI_CLASS , version ).expanded ():
10961096 bidi_classes [char ] = bidi
0 commit comments