Home | History | Annotate | Line # | Download | only in unicode
      1 #!/usr/bin/env python3
      2 #
      3 # Script to generate tables for libstdc++ std::format width estimation.
      4 #
      5 # This file is part of GCC.
      6 #
      7 # GCC is free software; you can redistribute it and/or modify it under
      8 # the terms of the GNU General Public License as published by the Free
      9 # Software Foundation; either version 3, or (at your option) any later
     10 # version.
     11 #
     12 # GCC is distributed in the hope that it will be useful, but WITHOUT ANY
     13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or
     14 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     15 # for more details.
     16 #
     17 # You should have received a copy of the GNU General Public License
     18 # along with GCC; see the file COPYING3.  If not see
     19 # <http://www.gnu.org/licenses/>.
     20 
     21 # To update the Libstdc++ static data in <bits/unicode-data.h> download the latest:
     22 # ftp://ftp.unicode.org/Public/UNIDATA/EastAsianWidth.txt
     23 # ftp://ftp.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
     24 # ftp://ftp.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
     25 # ftp://ftp.unicode.org/Public/UNIDATA/emoji/emoji-data.txt
     26 # Then run this script and save the output to
     27 # ../../libstdc++-v3/include/bits/unicode-data.h
     28 
     29 import sys
     30 import re
     31 import math
     32 import os
     33 
# Base name of this script; it is quoted in the generated header.
self = os.path.basename(__file__)

# Licence boilerplate and Doxygen @file block of the generated header.
# The text starts and ends with a newline, exactly as it is printed.
header_text = """
// Copyright The GNU Toolchain Authors.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

/** @file bits/unicode-data.h
 *  This is an internal header file, included by other library headers.
 *  Do not attempt to use it directly. @headername{format}
 */
"""

# Preprocessor guard emitted before any table: the including file must define
# _GLIBCXX_GET_UNICODE_DATA to the version number checked here.
guard_lines = (
    "#ifndef _GLIBCXX_GET_UNICODE_DATA",
    '# error "This is not a public header, do not include it directly"',
    "#elif _GLIBCXX_GET_UNICODE_DATA != 150100",
    '# error "Version mismatch for Unicode static data"',
    "#endif\n",
)

print("// Generated by contrib/unicode/{}, do not edit.".format(self))
print(header_text)
print("\n".join(guard_lines))
     69 
     70 # Process a list and return a list of tuples (index, val) which are the elements
     71 # in the list that have a different val from the previous element.
     72 # e.g. find_edges([a, a, b, b, c, b, b, d]) is [(0,a), (2,b), (4,c), (5,b), (7,d)]
     73 # and find_edges([a, a, b, b, c, b, b, d], a) is [(2,b), (4,c), (5,b), (7,d)]
def find_edges(vals, init = None):
    """Return the (index, value) pairs at which vals changes value.

    An element is reported when it differs from the element before it; the
    first element is compared against init, so it is reported only when it
    differs from init.
    """
    def changes():
        previous = init
        for position, current in enumerate(vals):
            if current != previous:
                previous = current
                yield (position, current)

    return list(changes())
     82 
# Table with one entry per code point, indexed by code point value.
# The script rebinds it to a freshly initialised list before reading each
# property; it is declared here so that process_code_points() has a global
# of this name to refer to.
all_code_points = []

# Process a code point value or range of code point values with given property.
def process_code_points(code_points, val):
    """Set the entries of all_code_points selected by code_points to val.

    code_points is the first field of a line from a Unicode data file:
    either a single hexadecimal code point ("232A") or an inclusive range
    ("1100..115F").  Whitespace around the numbers is accepted.

    Raises ValueError if the field is not of one of those two forms, if the
    range is reversed, or if it does not lie inside the current table.
    """
    # Example arguments:
    # 1100..115F, x
    # 232A, y

    bounds = code_points.split("..")
    if len(bounds) not in (1, 2):
        raise ValueError(
            "malformed code point field: {!r}".format(code_points))

    # A single code point is handled as a range of length one.
    first = int(bounds[0], base=16)
    last = int(bounds[-1], base=16)

    # Slice assignment would silently grow the table for an out-of-range
    # range, and silently do nothing for a reversed one, so check first.
    if not 0 <= first <= last < len(all_code_points):
        raise ValueError(
            "code point range {!r} is reversed or outside the table"
            .format(code_points))

    all_code_points[first:last + 1] = [val] * (last - first + 1)
    101 
    102 # By default every code point has width 1. This is what the C++ standard says,
    103 # even though the Unicode standard says some code points have width 0.
all_code_points = [1] * (1 + 0x10FFFF)

# Extract all code points with East_Asian_Width=W or East_Asian_Width=F
# The file is opened as UTF-8 explicitly, so that reading it does not depend
# on the locale, and in a with-statement so that it is closed when done.
with open("EastAsianWidth.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 3000           ; F
        # 3001..3003     ; W
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+;\s*[WF]\s*$', line):
            process_code_points(line.split(";")[0], 2)

# The C++ standard also gives width 2 to the following ranges:
# U+4DC0..U+4DFF (Yijing Hexagram Symbols)
process_code_points("4DC0..4DFF", 2)
# U+1F300..U+1F5FF (Miscellaneous Symbols and Pictographs)
process_code_points("1F300..1F5FF", 2)
# U+1F900..U+1F9FF (Supplemental Symbols and Pictographs)
process_code_points("1F900..1F9FF", 2)
    122 
    123 # Create a list that only contains the code points that have a different width
    124 # to the previous code point.
edges = find_edges(all_code_points, 1)

# Table for std::__unicode::__format_width(char32_t)

# The script name comes from `self`, like the "Generated by" line at the top
# of the output, so that the comment stays right if the script is renamed.
print("  // Table generated by contrib/unicode/{},".format(self))
print("  // from EastAsianWidth.txt from the Unicode standard.")
print("  inline constexpr char32_t __width_edges[] = {", end="")
for i, (c, _) in enumerate(edges):
    # Eight entries per output line; only the code point of each edge is
    # written, the width it changes to is not.
    sep = " " if i % 8 else "\n    "
    print("{}{:#x},".format(sep, c), end="")
print("\n  };\n")
    140 
    141 # By default every code point has Grapheme_Cluster_Break=Other.
all_code_points = ["Other"] * (1 + 0x10FFFF)

# Extract Grapheme_Cluster_Break property for all code points.
# The file is opened as UTF-8 explicitly, so that reading it does not depend
# on the locale, and in a with-statement so that it is closed when done.
with open("GraphemeBreakProperty.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 0600..0605    ; Prepend
        # 00AD          ; Control
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+;', line):
            code_points, grapheme_property = line.split(";")
            process_code_points(code_points, grapheme_property.strip())

edges = find_edges(all_code_points)
# Number the property values in the order they first appear in the table;
# "Other" is always 0.
gcb_props = {"Other":0}
for _, p in edges:
    if p not in gcb_props:
        gcb_props[p] = len(gcb_props)
# Number of bits needed to store the largest property number.  This is the
# exact integer form of ceil(log2(len(gcb_props))).
shift_bits = (len(gcb_props) - 1).bit_length()
    160 
    161 # Enum definition for std::__unicode::_Gcb_property
    162 
# One enumerator per property, in the order the numbers were assigned.
enumerators = [
    "    _Gcb_{} = {},".format(name, number)
    for name, number in gcb_props.items()
]
print("\n".join(["  enum class _Gcb_property {"] + enumerators + ["  };\n"]))
    167 
    168 # Tables for std::__unicode::_Grapheme_cluster_state
    169 
# The script name comes from `self`, like the "Generated by" line at the top
# of the output, so that the comment stays right if the script is renamed.
print("  // Values generated by contrib/unicode/{},".format(self))
print("  // from GraphemeBreakProperty.txt from the Unicode standard.")
print("  // Entries are (code_point << shift_bits) + property.")
print("  inline constexpr int __gcb_shift_bits = {:#x};".format(shift_bits))
print("  inline constexpr uint32_t __gcb_edges[] = {", end="")
for i, (c, p) in enumerate(edges):
    # Six entries per output line.
    sep = " " if i % 6 else "\n    "
    # Pack the code point and the number of its property into one value.
    x = (c << shift_bits) + gcb_props[p]
    print("{}{:#x},".format(sep, x), end="")
print("\n  };\n")
    184 
    185 # By default every code point has Indic_Conjunct_Break=None.
all_code_points = [None] * (1 + 0x10FFFF)

# Extract Indic_Conjunct_Break property for all code points.
# The file is opened as UTF-8 explicitly, so that reading it does not depend
# on the locale, and in a with-statement so that it is closed when done.
with open("DerivedCoreProperties.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 094D       ; InCB; Linker
        # 0B71       ; InCB; Consonant
        # 0300..034E ; InCB; Extend
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+; InCB;', line):
            code_points, _, incb_property = line.split(";")
            process_code_points(code_points, incb_property.strip())
    198 
    199 # Table for std::__unicode::__is_incb_linker
    200 # This table is tiny, so just contains the list of code points.
# Collect the linkers first, so the table can be modified once they are known.
linkers = [cp for cp, prop in enumerate(all_code_points) if prop == "Linker"]

print("  inline constexpr char32_t __incb_linkers[] = {\n   ", end="")
print("".join(" 0x{:04x},".format(cp) for cp in linkers), end="")
print("\n  };\n")

# The linkers have been written out as a plain list, so reset their entries
# to the default before the remaining values are turned into edges.
for cp in linkers:
    all_code_points[cp] = None

edges = find_edges(all_code_points)
    208 
# Numbers for the values left in the table; None means no InCB property.
incb_props = {None:0, "Consonant":1, "Extend":2}
print("  enum class _InCB { _Consonant = 1, _Extend = 2 };\n")
# Table for std::__unicode::__incb_property
# The script name comes from `self`, like the "Generated by" line at the top
# of the output, so that the comment stays right if the script is renamed.
print("  // Values generated by contrib/unicode/{},".format(self))
print("  // from DerivedCoreProperties.txt from the Unicode standard.")
print("  // Entries are (code_point << 2) + property.")
print("  inline constexpr uint32_t __incb_edges[] = {", end="")
for i, (c, p) in enumerate(edges):
    # Six entries per output line.
    sep = " " if i % 6 else "\n    "
    # The two low bits hold the property number, the rest the code point.
    x = (c << 2) + incb_props[p]
    print("{}{:#x},".format(sep, x), end="")
print("\n  };\n")
    225 
    226 # By default every code point has Emoji=No.
all_code_points = [False] * (1 + 0x10FFFF)

# Extract Emoji=Extended_Pictographic for all code points.
# The file is opened as UTF-8 explicitly, so that reading it does not depend
# on the locale, and in a with-statement so that it is closed when done.
with open("emoji-data.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 1100..115F ; Extended_Pictographic
        # 232A       ; Extended_Pictographic
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+; Extended_Pictographic', line):
            process_code_points(line.split(";")[0], True)

edges = find_edges(all_code_points, False)
    239 
    240 # Table for std::__unicode::__is_extended_pictographic
# The script name comes from `self`, like the "Generated by" line at the top
# of the output, so that the comment stays right if the script is renamed.
print("  // Table generated by contrib/unicode/{},".format(self))
print("  // from emoji-data.txt from the Unicode standard.")
print("  inline constexpr char32_t __xpicto_edges[] = {", end="")
for i, (c, _) in enumerate(edges):
    # Eight entries per output line; only the code point of each edge is
    # written, the value it changes to is not.
    sep = " " if i % 8 else "\n    "
    print("{}{:#x},".format(sep, c), end="")
print("\n  };\n")

# <bits/unicode.h> gives an error if this macro is left defined.
# Do this last, so that the generated output is not usable unless we reach here.
print("#undef _GLIBCXX_GET_UNICODE_DATA")
    256