# Utilities to generate Unicode data for glibc from upstream Unicode data.
#
# Copyright (C) 2014-2024 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''
This module contains utilities used by the scripts to generate
Unicode data for glibc from upstream Unicode data files.
'''

import sys
import re


# Common locale header.
COMMENT_HEADER = """
% This file is part of the GNU C Library and contains locale data.
% The Free Software Foundation does not claim any copyright interest
% in the locale data contained in this file. The foregoing does not
% affect the license of the GNU C Library as a whole. It does not
% exempt you from the conditions of the license if your use would
% otherwise be governed by that license.
"""

# Dictionary holding the entire contents of the UnicodeData.txt file
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#  ...
# }
UNICODE_ATTRIBUTES = {}

# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  ...
# }
DERIVED_CORE_PROPERTIES = {}

# Dictionary holding the entire contents of the EastAsianWidths.txt file
#
# Contents of this dictionary look like this:
#
# {0: 'N', ..., 45430: 'W', ...}
EAST_ASIAN_WIDTHS = {}

# Line patterns for DerivedCoreProperties.txt and EastAsianWidths.txt.
# Both match "XXXX ; Prop" and "XXXX..YYYY ; Prop"; they differ only in
# whether an underscore is allowed in the property name.  Compiled once
# here so the per-line loops do not rebuild the pattern string.
_DERIVED_CORE_PROPERTY_RE = re.compile(
    r'^(?P<codepoint1>[0-9A-F]{4,6})'
    + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
    + r'\s*;\s*(?P<property>[a-zA-Z_]+)')
_EAST_ASIAN_WIDTH_RE = re.compile(
    r'^(?P<codepoint1>[0-9A-F]{4,6})'
    +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
    +r'\s*;\s*(?P<property>[a-zA-Z]+)')

# Pairs of character classes which must not overlap, in the order in
# which verifications() reports them.
_MUTUALLY_EXCLUSIVE_CLASSES = (
    # alpha restriction: No character specified for the keywords cntrl,
    # digit, punct or space shall be specified.
    ('alpha', 'cntrl'),
    ('alpha', 'digit'),
    ('alpha', 'punct'),
    ('alpha', 'space'),
    # space restriction: No character specified for the keywords upper,
    # lower, alpha, digit, graph or xdigit shall be specified.
    # upper, lower, alpha are already covered by the alpha checks.
    ('space', 'digit'),
    ('space', 'graph'),
    ('space', 'xdigit'),
    # cntrl restriction: No character specified for the keywords upper,
    # lower, alpha, digit, punct, graph, print or xdigit shall be
    # specified.  upper, lower, alpha are already covered.
    ('cntrl', 'digit'),
    ('cntrl', 'punct'),
    ('cntrl', 'graph'),
    ('cntrl', 'print'),
    ('cntrl', 'xdigit'),
    # punct restriction: No character specified for the keywords upper,
    # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
    # be specified.  upper, lower, alpha, cntrl are already covered.
    ('punct', 'digit'),
    ('punct', 'xdigit'),
)


def fill_attribute(code_point, fields):
    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.

    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
    in the UnicodeData.txt file.

    code_point is the integer key to store under; fields is the list of
    the 15 semicolon separated fields of the line (fields[0], the code
    point itself, is not used here).  The three case mapping fields are
    converted from hexadecimal strings to integers, or to None when empty.
    '''
    UNICODE_ATTRIBUTES[code_point] = {
        'name': fields[1],          # Character name
        'category': fields[2],      # General category
        'combining': fields[3],     # Canonical combining classes
        'bidi': fields[4],          # Bidirectional category
        'decomposition': fields[5], # Character decomposition mapping
        'decdigit': fields[6],      # Decimal digit value
        'digit': fields[7],         # Digit value
        'numeric': fields[8],       # Numeric value
        'mirrored': fields[9],      # mirrored
        'oldname': fields[10],      # Old Unicode 1.0 name
        'comment': fields[11],      # comment
        # Uppercase mapping
        'upper': int(fields[12], 16) if fields[12] else None,
        # Lowercase mapping
        'lower': int(fields[13], 16) if fields[13] else None,
        # Titlecase mapping
        'title': int(fields[14], 16) if fields[14] else None,
    }


def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    Every code point of such a range gets the same attributes, with the
    name reduced to the part before the comma (e.g. 'CJK Ideograph').
    Surrogates (category Cs) are skipped.

    Writes a message to stderr and exits with status 1 if a line does
    not have exactly 15 fields or if a "Last" line does not match the
    preceding "First" line.
    '''
    # The Unicode data files are UTF-8; do not depend on the locale.
    with open(filename, mode='r', encoding='utf-8') as unicode_data_file:
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                        'f': filename, 'l': line})
                sys.exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                # Remember the start of the range; strip '<' and
                # ', First>' so it can be compared with the end line.
                fields_start = fields
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                # Apart from the code point, both lines must agree.
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    sys.exit(1)
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []


def _matched_code_points(match):
    '''Returns the range of code points described by a match of one of
    the "XXXX" / "XXXX..YYYY" line patterns (both ends inclusive).'''
    start = match.group('codepoint1')
    end = match.group('codepoint2') or start
    return range(int(start, 16), int(end, 16)+1)


def fill_derived_core_properties(filename):
    '''Stores the entire contents of the DerivedCoreProperties.txt file
    in the DERIVED_CORE_PROPERTIES dictionary.

    Lines in DerivedCoreProperties.txt are either a code point range like
    this:

    0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or a single code point like this:

    00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR

    Lines which match neither form (comments, blank lines) are ignored.
    The property name is appended to the list of every code point the
    line covers.
    '''
    # The Unicode data files are UTF-8; do not depend on the locale.
    with open(filename, mode='r',
              encoding='utf-8') as derived_core_properties_file:
        for line in derived_core_properties_file:
            match = _DERIVED_CORE_PROPERTY_RE.match(line)
            if not match:
                continue
            prop = match.group('property')
            for code_point in _matched_code_points(match):
                DERIVED_CORE_PROPERTIES.setdefault(
                    code_point, []).append(prop)


def fill_east_asian_widths(filename):
    '''Stores the entire contents of the EastAsianWidths.txt file
    in the EAST_ASIAN_WIDTHS dictionary.

    Lines in EastAsianWidths.txt are either a code point range like
    this:

    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

    or a single code point like this:

    A015;W           # Lm         YI SYLLABLE WU

    Lines which match neither form (comments, blank lines) are ignored.
    A later line overrides the width stored by an earlier one.
    '''
    # The Unicode data files are UTF-8; do not depend on the locale.
    with open(filename, mode='r', encoding='utf-8') as east_asian_widths_file:
        for line in east_asian_widths_file:
            match = _EAST_ASIAN_WIDTH_RE.match(line)
            if not match:
                continue
            width = match.group('property')
            for code_point in _matched_code_points(match):
                EAST_ASIAN_WIDTHS[code_point] = width


def _simple_case_mapping(code_point, mapping):
    '''Returns UNICODE_ATTRIBUTES[code_point][mapping] ('upper', 'lower'
    or 'title') if the character has a name and such a mapping,
    otherwise the code point itself.'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes[mapping]:
        return attributes[mapping]
    return code_point


def to_upper(code_point):
    '''Returns the code point of the uppercase version
    of the given code point'''
    return _simple_case_mapping(code_point, 'upper')


def to_lower(code_point):
    '''Returns the code point of the lowercase version
    of the given code point'''
    return _simple_case_mapping(code_point, 'lower')


def to_upper_turkish(code_point):
    '''Returns the code point of the Turkish uppercase version
    of the given code point'''
    # i -> LATIN CAPITAL LETTER I WITH DOT ABOVE
    if code_point == 0x0069:
        return 0x0130
    return to_upper(code_point)


def to_lower_turkish(code_point):
    '''Returns the code point of the Turkish lowercase version
    of the given code point'''
    # I -> LATIN SMALL LETTER DOTLESS I
    if code_point == 0x0049:
        return 0x0131
    return to_lower(code_point)


def to_title(code_point):
    '''Returns the code point of the titlecase version
    of the given code point'''
    return _simple_case_mapping(code_point, 'title')


def is_upper(code_point):
    '''Checks whether the character with this code point is uppercase'''
    return (to_lower(code_point) != code_point
            or (code_point in DERIVED_CORE_PROPERTIES
                and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]))


def is_lower(code_point):
    '''Checks whether the character with this code point is lowercase'''
    # Some characters are defined as Lowercase in
    # DerivedCoreProperties.txt but do not have a mapping to upper
    # case. For example, U+A72F LATIN LETTER SMALL CAPITAL F is
    # one of these.
    return (to_upper(code_point) != code_point
            # <U00DF> is lowercase, but without simple to_upper mapping.
            or code_point == 0x00DF
            or (code_point in DERIVED_CORE_PROPERTIES
                and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]))


def is_alpha(code_point):
    '''Checks whether the character with this code point is alphabetic'''
    return ((code_point in DERIVED_CORE_PROPERTIES
             and
             'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
            or
            # Consider all the non-ASCII digits as alphabetic.
            # ISO C 99 forbids us to have them in category digit,
            # but we want iswalnum to return true on them.
            (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
             and not (code_point >= 0x0030 and code_point <= 0x0039)))


def is_digit(code_point):
    '''Checks whether the character with this code point is a digit'''
    # Classifying by general category 'Nd' is deliberately not done.
    # (If it were: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems
    # without a zero; <0> would have to be added in front of them by hand.)
    #
    # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.5:
    #    The iswdigit function tests for any wide character that
    #    corresponds to a decimal-digit character (as defined in 5.2.1).
    # 5.2.1:
    #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
    return (code_point >= 0x0030 and code_point <= 0x0039)


def is_outdigit(code_point):
    '''Checks whether the character with this code point is outdigit'''
    return (code_point >= 0x0030 and code_point <= 0x0039)


def is_blank(code_point):
    '''Checks whether the character with this code point is blank'''
    return (code_point == 0x0009 # '\t'
            # Category Zs without mention of '<noBreak>'
            or (UNICODE_ATTRIBUTES[code_point]['name']
                and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs'
                and '<noBreak>' not in
                UNICODE_ATTRIBUTES[code_point]['decomposition']))


def is_space(code_point):
    '''Checks whether the character with this code point is a space'''
    # Don't make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    return (code_point == 0x0020 # ' '
            or code_point == 0x000C # '\f'
            or code_point == 0x000A # '\n'
            or code_point == 0x000D # '\r'
            or code_point == 0x0009 # '\t'
            or code_point == 0x000B # '\v'
            # Categories Zl, Zp, and Zs without mention of "<noBreak>"
            or (UNICODE_ATTRIBUTES[code_point]['name']
                and
                (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']
                 or
                 (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs']
                  and
                  '<noBreak>' not in
                  UNICODE_ATTRIBUTES[code_point]['decomposition']))))


def is_cntrl(code_point):
    '''Checks whether the character with this code point is
    a control character'''
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>'
                 or
                 UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']))


def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit'''
    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.12:
    #    The iswxdigit function tests for any wide character that
    #    corresponds to a hexadecimal-digit character (as defined
    #    in 6.4.4.1).
    # 6.4.4.1:
    #    hexadecimal-digit: one of
    #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
    return ((code_point >= 0x0030 and code_point <= 0x0039)
            or (code_point >= 0x0041 and code_point <= 0x0046)
            or (code_point >= 0x0061 and code_point <= 0x0066))


def is_graph(code_point):
    '''Checks whether the character with this code point is
    a graphical character'''
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
            and not is_space(code_point))


def is_print(code_point):
    '''Checks whether the character with this code point is printable'''
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
            and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp'])


def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation'''
    # The Unicode general categories P* are deliberately not used here.
    # The traditional POSIX definition of punctuation is every graphic,
    # non-alphanumeric character.
    return (is_graph(code_point)
            and not is_alpha(code_point)
            and not is_digit(code_point))


def is_combining(code_point):
    '''Checks whether the character with this code point is
    a combining character'''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and
            UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me'])


def is_combining_level3(code_point):
    '''Checks whether the character with this code point is
    a combining level3 character'''
    return (is_combining(code_point)
            and
            int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200))


def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points below 0x10000 are written with 4 hexadecimal digits
    (e.g. <U0041>), all others with 8 (e.g. <U0001F600>).
    '''
    if code_point < 0x10000:
        return '<U{:04X}>'.format(code_point)
    else:
        return '<U{:08X}>'.format(code_point)


def ucs_symbol_range(code_point_low, code_point_high):
    '''Returns a string UCS symbol string for a code point range.

    Example:

    <U0041>..<U005A>
    '''
    return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high)


def verifications():
    '''Tests whether the is_* functions observe the known restrictions

    Every code point in UNICODE_ATTRIBUTES is checked; each violation
    is reported as one line on stderr.  Nothing is returned and
    violations do not abort the program.
    '''
    for code_point in sorted(UNICODE_ATTRIBUTES):
        sym = ucs_symbol(code_point)
        upper = to_upper(code_point)
        lower = to_lower(code_point)
        cased = is_lower(code_point) or is_upper(code_point)
        in_class = {
            'alpha': is_alpha(code_point),
            'cntrl': is_cntrl(code_point),
            'digit': is_digit(code_point),
            'graph': is_graph(code_point),
            'print': is_print(code_point),
            'punct': is_punct(code_point),
            'space': is_space(code_point),
            'xdigit': is_xdigit(code_point),
        }
        # toupper restriction: "Only characters specified for the keywords
        # lower and upper shall be specified.
        if upper != code_point and not cased:
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
                    'sym': sym,
                    'c': code_point,
                    'uc': upper})
        # tolower restriction: "Only characters specified for the keywords
        # lower and upper shall be specified.
        if lower != code_point and not cased:
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
                    'sym': sym,
                    'c': code_point,
                    'uc': lower})
        # alpha restriction: "Characters classified as either upper or lower
        # shall automatically belong to this class.
        if cased and not in_class['alpha']:
            sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
                'sym': sym})
        # Classes which must not have any character in common.
        for first, second in _MUTUALLY_EXCLUSIVE_CLASSES:
            if in_class[first] and in_class[second]:
                sys.stderr.write('%(sym)s is %(a)s and %(b)s\n' %{
                    'sym': sym, 'a': first, 'b': second})
        # punct restriction, last part: the <space> character itself
        # must not be punct.
        if in_class['punct'] and code_point == 0x0020:
            sys.stderr.write('%(sym)s is punct\n' %{
                'sym': sym})
        # graph restriction: No character specified for the keyword cntrl
        # shall be specified. Already checked above.

        # print restriction: No character specified for the keyword cntrl
        # shall be specified. Already checked above.

        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of space characters,
        # not space character.
        if (in_class['print']
                and not (in_class['graph'] or in_class['space'])):
            sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
                'sym': sym})
        if (not in_class['print']
                and (in_class['graph'] or code_point == 0x0020)):
            sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
                'sym': sym})