Home | History | Annotate | Line # | Download | only in from_glibc
      1 # Utilities to generate Unicode data for glibc from upstream Unicode data.
      2 #
      3 # Copyright (C) 2014-2024 Free Software Foundation, Inc.
      4 # This file is part of the GNU C Library.
      5 #
      6 # The GNU C Library is free software; you can redistribute it and/or
      7 # modify it under the terms of the GNU Lesser General Public
      8 # License as published by the Free Software Foundation; either
      9 # version 2.1 of the License, or (at your option) any later version.
     10 #
     11 # The GNU C Library is distributed in the hope that it will be useful,
     12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14 # Lesser General Public License for more details.
     15 #
     16 # You should have received a copy of the GNU Lesser General Public
     17 # License along with the GNU C Library; if not, see
     18 # <https://www.gnu.org/licenses/>.
     19 
     20 '''
     21 This module contains utilities used by the scripts to generate
     22 Unicode data for glibc from upstream Unicode data files.
     23 '''
     24 
     25 import sys
     26 import re
     27 
     28 
# Common locale header.  Every line of the text starts with '%', and
# the string both begins and ends with a newline.
COMMENT_HEADER = """
% This file is part of the GNU C Library and contains locale data.
% The Free Software Foundation does not claim any copyright interest
% in the locale data contained in this file.  The foregoing does not
% affect the license of the GNU C Library as a whole.  It does not
% exempt you from the conditions of the license if your use would
% otherwise be governed by that license.
"""
     38 
# Dictionary holding the entire contents of the UnicodeData.txt file.
# It is filled by fill_attributes() and maps an integer code point to
# a dictionary with the attributes of that code point.
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#  ...
# }
UNICODE_ATTRIBUTES = {}
     60 
# Dictionary holding the entire contents of the DerivedCoreProperties.txt
# file.  It is filled by fill_derived_core_properties() and maps an
# integer code point to the list of property names read for it.
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  ...
# }
DERIVED_CORE_PROPERTIES = {}
     70 
# Dictionary holding the entire contents of the EastAsianWidths.txt file.
# It is filled by fill_east_asian_widths() and maps an integer code
# point to the width property string read for it.
#
# Contents of this dictionary look like this:
#
# {0: 'N', ..., 45430: 'W', ...}
EAST_ASIAN_WIDTHS = {}
     77 
def fill_attribute(code_point, fields):
    '''Records one UnicodeData.txt line in UNICODE_ATTRIBUTES.

    code_point is the dictionary key to store under.  fields is the
    list of fields of the line; fields[1] to fields[11] are stored
    unchanged as strings, while the case mappings in fields[12] to
    fields[14] are parsed as hexadecimal numbers (None when empty).
    fields[0] is not read here.

    '''
    # Names for fields[1] .. fields[11], in file order.
    text_keys = (
        'name',           # Character name
        'category',       # General category
        'combining',      # Canonical combining classes
        'bidi',           # Bidirectional category
        'decomposition',  # Character decomposition mapping
        'decdigit',       # Decimal digit value
        'digit',          # Digit value
        'numeric',        # Numeric value
        'mirrored',       # mirrored
        'oldname',        # Old Unicode 1.0 name
        'comment',        # comment
    )
    # Names for fields[12] .. fields[14]: upper-, lower- and titlecase
    # mapping.
    mapping_keys = ('upper', 'lower', 'title')

    entry = {}
    for index, key in enumerate(text_keys, start=1):
        entry[key] = fields[index]
    for index, key in enumerate(mapping_keys, start=12):
        text = fields[index]
        entry[key] = int(text, 16) if text else None
    UNICODE_ATTRIBUTES[code_point] = entry
    104 
def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    Every code point of such a range gets its own entry, named after
    the part of the range name before the comma (without the leading
    '<').  Lines with general category 'Cs' (surrogates) are skipped.

    A line that does not have exactly 15 fields, or a ', Last>' line
    whose fields do not match the preceding ', First>' line, is
    reported on stderr and terminates the program with exit status 1.
    '''
    # The file is read as UTF-8 explicitly so that parsing does not
    # depend on the locale of the environment running the script.
    with open(filename, mode='r', encoding='utf-8') as unicode_data_file:
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' % {
                        'f': filename, 'l': line})
                # sys.exit() rather than exit(): the latter is only
                # injected by the site module.
                sys.exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                # Remember the start of the range; keep only the bare
                # range name, e.g. 'CJK Ideograph'.
                fields_start = fields
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                # Apart from the code point itself, both lines of a
                # range must agree.  An unmatched ', Last>' line also
                # ends up here because fields_start is then empty.
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n'
                        % {'f': filename, 'l': line})
                    sys.exit(1)
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16) + 1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []
    152 
def fill_derived_core_properties(filename):
    '''Stores the entire contents of the DerivedCoreProperties.txt file
    in the DERIVED_CORE_PROPERTIES dictionary.

    Lines in DerivedCoreProperties.txt are either a code point range like
    this:

    0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or a single code point like this:

    00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR

    Lines that do not start with a code point (or range) followed by
    a property name are ignored.  For every code point covered by a
    line, the property name is appended to the list stored under that
    code point.

    '''
    # Compiled once instead of being rebuilt for every line.
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        r'\s*;\s*(?P<property>[a-zA-Z_]+)')
    # The file is read as UTF-8 explicitly so that parsing does not
    # depend on the locale of the environment running the script.
    with open(filename, mode='r',
              encoding='utf-8') as derived_core_properties_file:
        for line in derived_core_properties_file:
            match = line_pattern.match(line)
            if not match:
                continue
            first = int(match.group('codepoint1'), 16)
            last_text = match.group('codepoint2')
            # A single code point is a range of length one.
            last = int(last_text, 16) if last_text else first
            prop = match.group('property')
            for code_point in range(first, last + 1):
                DERIVED_CORE_PROPERTIES.setdefault(
                    code_point, []).append(prop)
    186 
def fill_east_asian_widths(filename):
    '''Stores the entire contents of the EastAsianWidths.txt file
    in the EAST_ASIAN_WIDTHS dictionary.

    Lines in EastAsianWidths.txt are either a code point range like
    this:

    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

    or a single code point like this:

    A015;W           # Lm         YI SYLLABLE WU

    Lines that do not start with a code point (or range) followed by
    a property value are ignored.  A later line covering the same code
    point overwrites the value stored by an earlier one.
    '''
    # Compiled once instead of being rebuilt for every line.
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        r'\s*;\s*(?P<property>[a-zA-Z]+)')
    # The file is read as UTF-8 explicitly so that parsing does not
    # depend on the locale of the environment running the script.
    with open(filename, mode='r', encoding='utf-8') as east_asian_widths_file:
        for line in east_asian_widths_file:
            match = line_pattern.match(line)
            if not match:
                continue
            first = int(match.group('codepoint1'), 16)
            last_text = match.group('codepoint2')
            # A single code point is a range of length one.
            last = int(last_text, 16) if last_text else first
            width = match.group('property')
            for code_point in range(first, last + 1):
                EAST_ASIAN_WIDTHS[code_point] = width
    215 
def to_upper(code_point):
    '''Returns the uppercase mapping of the given code point.

    The mapping is the 'upper' value stored in UNICODE_ATTRIBUTES.
    The code point itself is returned when its entry has an empty
    name or no uppercase mapping.'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if not (attributes['name'] and attributes['upper']):
        return code_point
    return attributes['upper']
    224 
def to_lower(code_point):
    '''Returns the lowercase mapping of the given code point.

    The mapping is the 'lower' value stored in UNICODE_ATTRIBUTES.
    The code point itself is returned when its entry has an empty
    name or no lowercase mapping.'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if not (attributes['name'] and attributes['lower']):
        return code_point
    return attributes['lower']
    233 
def to_upper_turkish(code_point):
    '''Returns the Turkish uppercase mapping of the given code point.

    Identical to to_upper() except that U+0069 maps to U+0130.'''
    return 0x0130 if code_point == 0x0069 else to_upper(code_point)
    240 
def to_lower_turkish(code_point):
    '''Returns the Turkish lowercase mapping of the given code point.

    Identical to to_lower() except that U+0049 maps to U+0131.'''
    return 0x0131 if code_point == 0x0049 else to_lower(code_point)
    247 
def to_title(code_point):
    '''Returns the titlecase mapping of the given code point.

    The mapping is the 'title' value stored in UNICODE_ATTRIBUTES.
    The code point itself is returned when its entry has an empty
    name or no titlecase mapping.'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if not (attributes['name'] and attributes['title']):
        return code_point
    return attributes['title']
    256 
def is_upper(code_point):
    '''Checks whether the character with this code point is uppercase.

    A character counts as uppercase when it has a lowercase mapping
    different from itself or when DERIVED_CORE_PROPERTIES lists it as
    'Uppercase'.'''
    if to_lower(code_point) != code_point:
        return True
    return 'Uppercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
    262 
def is_lower(code_point):
    '''Checks whether the character with this code point is lowercase.

    A character counts as lowercase when it has an uppercase mapping
    different from itself, when it is U+00DF, or when
    DERIVED_CORE_PROPERTIES lists it as 'Lowercase'.'''
    if to_upper(code_point) != code_point:
        return True
    # <U00DF> is lowercase, but without simple to_upper mapping.
    if code_point == 0x00DF:
        return True
    # Some characters are defined as Lowercase in
    # DerivedCoreProperties.txt but do not have a mapping to upper
    # case. For example, U+A72F LATIN LETTER SMALL CAPITAL F is
    # one of these.
    return 'Lowercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
    274 
def is_alpha(code_point):
    '''Checks whether the character with this code point is alphabetic.

    True for code points listed as 'Alphabetic' in
    DERIVED_CORE_PROPERTIES and for every code point of general
    category 'Nd' outside U+0030..U+0039.'''
    if 'Alphabetic' in DERIVED_CORE_PROPERTIES.get(code_point, ()):
        return True
    # Consider all the non-ASCII digits as alphabetic.
    # ISO C 99 forbids us to have them in category digit,
    # but we want iswalnum to return true on them.
    if UNICODE_ATTRIBUTES[code_point]['category'] != 'Nd':
        return False
    return not 0x0030 <= code_point <= 0x0039
    286 
def is_digit(code_point):
    '''Checks whether the character with this code point is a digit.

    Only the ten ASCII digits U+0030..U+0039 are digits.
    '''
    # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.5:
    #    The iswdigit function tests for any wide character that
    #    corresponds to a decimal-digit character (as defined in 5.2.1).
    # 5.2.1:
    #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
    #
    # The alternative, deliberately not used, would be to accept every
    # named character of general category 'Nd'.  Note: U+0BE7..U+0BEF
    # and U+1369..U+1371 are digit systems without a zero; <0> would
    # have to be added in front of them by hand.
    return 0x0030 <= code_point <= 0x0039
    303 
def is_outdigit(code_point):
    '''Checks whether the character with this code point is outdigit.

    Only U+0030..U+0039 are accepted.'''
    lowest, highest = 0x0030, 0x0039
    return lowest <= code_point <= highest
    307 
def is_blank(code_point):
    '''Checks whether the character with this code point is blank.

    Blank characters are the tab U+0009 and the named characters of
    category 'Zs' whose decomposition does not mention '<noBreak>'.'''
    if code_point == 0x0009:  # '\t'
        return True
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Category Zs without mention of '<noBreak>'
    return (attributes['name']
            and attributes['category'] == 'Zs'
            and '<noBreak>' not in attributes['decomposition'])
    316 
def is_space(code_point):
    '''Checks whether the character with this code point is a space.

    Spaces are the six ASCII white space characters, the named
    characters of categories 'Zl' and 'Zp', and the named characters
    of category 'Zs' whose decomposition does not mention
    '<noBreak>'.'''
    # ' ', '\f', '\n', '\r', '\t', '\v'
    if code_point in (0x0020, 0x000C, 0x000A, 0x000D, 0x0009, 0x000B):
        return True
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    if not name:
        return name
    category = attributes['category']
    if category in ['Zl', 'Zp']:
        return True
    # Dont make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    return (category in ['Zs']
            and '<noBreak>' not in attributes['decomposition'])
    336 
def is_cntrl(code_point):
    '''Checks whether the character with this code point is
    a control character.

    Control characters are those named '<control>' and the named
    characters of categories 'Zl' and 'Zp'.'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    if not name:
        return name
    return name == '<control>' or attributes['category'] in ['Zl', 'Zp']
    344 
def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit.

    Only the ASCII characters 0-9, A-F and a-f are hexadecimal digits.
    '''
    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.12:
    #    The iswxdigit function tests for any wide character that
    #    corresponds to a hexadecimal-digit character (as defined
    #    in 6.4.4.1).
    # 6.4.4.1:
    #    hexadecimal-digit: one of
    #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
    return (0x0030 <= code_point <= 0x0039      # 0-9
            or 0x0041 <= code_point <= 0x0046   # A-F
            or 0x0061 <= code_point <= 0x0066)  # a-f
    365 
def is_graph(code_point):
    '''Checks whether the character with this code point is
    a graphical character.

    Graphical characters are the named characters which are neither
    named '<control>' nor accepted by is_space().'''
    name = UNICODE_ATTRIBUTES[code_point]['name']
    if not name:
        return name
    if name == '<control>':
        return False
    return not is_space(code_point)
    372 
def is_print(code_point):
    '''Checks whether the character with this code point is printable.

    Printable characters are the named characters which are neither
    named '<control>' nor of category 'Zl' or 'Zp'.'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    if not name:
        return name
    if name == '<control>':
        return False
    return attributes['category'] not in ['Zl', 'Zp']
    378 
def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation.

    A character is punctuation when is_graph() accepts it and neither
    is_alpha() nor is_digit() does.
    '''
    # The traditional POSIX definition of punctuation is every graphic,
    # non-alphanumeric character.  The alternative, deliberately not
    # used, would be to accept the named characters whose general
    # category starts with 'P'.
    return (is_graph(code_point)
            and not is_alpha(code_point)
            and not is_digit(code_point))
    390 
def is_combining(code_point):
    '''Checks whether the character with this code point is
    a combining character.

    Combining characters are the named characters of the general
    categories 'Mn', 'Mc' and 'Me'.'''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    if not name:
        return name
    return attributes['category'] in ['Mn', 'Mc', 'Me']
    401 
def is_combining_level3(code_point):
    '''Checks whether the character with this code point is
    a combining level3 character.

    These are the characters accepted by is_combining() whose
    'combining' value, read as an integer, is at least 0 and below
    200.'''
    combining = is_combining(code_point)
    if not combining:
        return combining
    combining_class = int(UNICODE_ATTRIBUTES[code_point]['combining'])
    return 0 <= combining_class < 200
    408 
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points below 0x10000 are written with at least four
    uppercase hexadecimal digits, all others with at least eight.'''
    digits = 4 if code_point < 0x10000 else 8
    return '<U{:0{width}X}>'.format(code_point, width=digits)
    415 
def ucs_symbol_range(code_point_low, code_point_high):
    '''Returns the UCS symbol string for a code point range.

    The result consists of the symbols of both ends of the range,
    joined by two dots.  Example:

    <U0041>..<U005A>
    '''
    ends = (code_point_low, code_point_high)
    return '..'.join(ucs_symbol(code_point) for code_point in ends)
    424 
def verifications():
    '''Tests whether the is_* functions observe the known restrictions.

    Every code point in UNICODE_ATTRIBUTES is checked in ascending
    order.  For each violated restriction one line is written to
    stderr; nothing is returned.
    '''
    # Pairs of character classes which must not overlap, in the order
    # in which violations are reported.
    #
    # alpha restriction: No character specified for the keywords cntrl,
    # digit, punct or space shall be specified.
    #
    # space restriction: No character specified for the keywords upper,
    # lower, alpha, digit, graph or xdigit shall be specified.
    #
    # cntrl restriction: No character specified for the keywords upper,
    # lower, alpha, digit, punct, graph, print or xdigit shall be
    # specified.
    #
    # punct restriction: No character specified for the keywords upper,
    # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
    # be specified.
    #
    # Pairs already covered by an earlier restriction are not repeated,
    # and the upper/lower cases are covered by the "upper|lower but not
    # alpha" check below.
    exclusive_classes = (
        ('alpha', 'cntrl'),
        ('alpha', 'digit'),
        ('alpha', 'punct'),
        ('alpha', 'space'),
        ('space', 'digit'),
        ('space', 'graph'),
        ('space', 'xdigit'),
        ('cntrl', 'digit'),
        ('cntrl', 'punct'),
        ('cntrl', 'graph'),
        ('cntrl', 'print'),
        ('cntrl', 'xdigit'),
        ('punct', 'digit'),
        ('punct', 'xdigit'),
    )
    for code_point in sorted(UNICODE_ATTRIBUTES):
        sym = ucs_symbol(code_point)
        # Evaluate every classification once per code point.
        member = {
            'alpha': is_alpha(code_point),
            'cntrl': is_cntrl(code_point),
            'digit': is_digit(code_point),
            'graph': is_graph(code_point),
            'print': is_print(code_point),
            'punct': is_punct(code_point),
            'space': is_space(code_point),
            'xdigit': is_xdigit(code_point),
        }
        cased = is_lower(code_point) or is_upper(code_point)
        # toupper restriction: "Only characters specified for the keywords
        # lower and upper shall be specified."
        upper_mapping = to_upper(code_point)
        if upper_mapping != code_point and not cased:
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') % {
                     'sym': sym,
                     'c': code_point,
                     'uc': upper_mapping})
        # tolower restriction: "Only characters specified for the keywords
        # lower and upper shall be specified."
        lower_mapping = to_lower(code_point)
        if lower_mapping != code_point and not cased:
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') % {
                     'sym': sym,
                     'c': code_point,
                     'uc': lower_mapping})
        # alpha restriction: "Characters classified as either upper or lower
        # shall automatically belong to this class."
        if cased and not member['alpha']:
            sys.stderr.write('%(sym)s is upper|lower but not alpha\n' % {
                'sym': sym})
        for first, second in exclusive_classes:
            if member[first] and member[second]:
                sys.stderr.write('%(sym)s is %(a)s and %(b)s\n' % {
                    'sym': sym, 'a': first, 'b': second})
        # Last part of the punct restriction: the <space> character
        # itself must not be punctuation.
        if member['punct'] and code_point == 0x0020:
            sys.stderr.write('%(sym)s is punct\n' % {'sym': sym})
        # graph restriction: No character specified for the keyword cntrl
        # shall be specified.  Already checked above.

        # print restriction: No character specified for the keyword cntrl
        # shall be specified.  Already checked above.

        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of space characters,
        # not space character.
        if (member['print']
                and not (member['graph'] or member['space'])):
            sys.stderr.write('%(sym)s is print but not graph|<space>\n' % {
                'sym': sym})
        if (not member['print']
                and (member['graph'] or code_point == 0x0020)):
            sys.stderr.write('%(sym)s is graph|<space> but not print\n' % {
                'sym': sym})
    528