#!/usr/bin/env python3
#
# Script to generate tables for libstdc++ std::format width estimation.
#
# This file is part of GCC.
#
# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3. If not see
# <http://www.gnu.org/licenses/>.

# To update the Libstdc++ static data in <bits/unicode-data.h> download the latest:
# ftp://ftp.unicode.org/Public/UNIDATA/EastAsianWidth.txt
# ftp://ftp.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
# ftp://ftp.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
# ftp://ftp.unicode.org/Public/UNIDATA/emoji/emoji-data.txt
# Then run this script and save the output to
# ../../libstdc++-v3/include/bits/unicode-data.h

import sys
import re
import math
import os

# NOTE(review): the spacing inside the string literals below was reconstructed
# from a whitespace-collapsed copy of this script; confirm it against the
# generated header before regenerating.

# The banner names whatever file this script is currently saved as.
self = os.path.basename(__file__)
print("// Generated by contrib/unicode/{}, do not edit.".format(self))
print("""
// Copyright The GNU Toolchain Authors.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.

/** @file bits/unicode-data.h
 * This is an internal header file, included by other library headers.
 * Do not attempt to use it directly. @headername{format}
 */
""")

# Guard emitted at the top of the generated header: the includer must define
# _GLIBCXX_GET_UNICODE_DATA to the value the tables were generated for.
for directive in (
    "#ifndef _GLIBCXX_GET_UNICODE_DATA",
    '# error "This is not a public header, do not include it directly"',
    "#elif _GLIBCXX_GET_UNICODE_DATA != 150100",
    '# error "Version mismatch for Unicode static data"',
    "#endif\n",
):
    print(directive)


def find_edges(vals, init = None):
    """Return the (index, value) pairs at which vals changes value.

    Each element is compared with the one before it; the first element is
    compared with init.  Only elements that differ from their predecessor
    are reported, e.g.

      find_edges([a, a, b, b, c, b, b, d])    -> [(0,a), (2,b), (4,c), (5,b), (7,d)]
      find_edges([a, a, b, b, c, b, b, d], a) -> [(2,b), (4,c), (5,b), (7,d)]
    """
    previous = init
    result = []
    for position, current in enumerate(vals):
        changed = current != previous
        previous = current
        if changed:
            result.append((position, current))
    return result


# Per-code-point property table; each section of the script replaces it with
# a fresh list before filling it in.
all_code_points = []

# Process a code point value or range of code point values with given property.
def process_code_points(code_points, val):
    """Record property val for one code point or an inclusive range.

    code_points is a hexadecimal code point such as "232A", or an inclusive
    range written "first..last" such as "1100..115F".  The corresponding
    entries of the global all_code_points list are overwritten with val.

    Raises ValueError if code_points is not of one of those two forms, or
    if it does not lie entirely inside all_code_points.
    """
    # Example arguments:
    # 1100..115F, x
    # 232A, y

    r = code_points.split("..")
    if len(r) == 1:
        begin = int(r[0], base=16)
        end = begin + 1
    elif len(r) == 2:
        begin = int(r[0], base=16)
        end = int(r[1], base=16) + 1
    else:
        raise ValueError(
            "malformed code point range: {!r}".format(code_points))
    # Slice assignment past the end of a list silently grows it, which would
    # corrupt the table instead of reporting bad input, so check the bounds.
    if not 0 <= begin < end <= len(all_code_points):
        raise ValueError(
            "code point range out of bounds: {!r}".format(code_points))
    all_code_points[begin:end] = [val] * (end - begin)

# By default every code point has width 1. This is what the C++ standard says,
# even though the Unicode standard says some code points have width 0.
all_code_points = [1] * (1 + 0x10FFFF)

# Extract all code points with East_Asian_Width=W or East_Asian_Width=F
# The Unicode data files are UTF-8, so decode them as such instead of using
# the locale default, and close the file once it has been read.
with open("EastAsianWidth.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 3000 ; F
        # 3001..3003 ; W
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+;\s*[WF]\s*$', line):
            process_code_points(line.split(";")[0], 2)

# The C++ standard also gives width 2 to the following ranges:
# U+4DC0 U+4DFF (Yijing Hexagram Symbols)
process_code_points("4DC0..4DFF", 2)
# U+1F300 U+1F5FF (Miscellaneous Symbols and Pictographs)
process_code_points("1F300..1F5FF", 2)
# U+1F900 U+1F9FF (Supplemental Symbols and Pictographs)
process_code_points("1F900..1F9FF", 2)

# Create a list that only contains the code points that have a different width
# to the previous code point.
edges = find_edges(all_code_points, 1)

# Table for std::__unicode::__format_width(char32_t)

# NOTE(review): the spacing inside the string literals below was reconstructed
# from a whitespace-collapsed copy of this script; confirm it against the
# generated header before regenerating.
print(" // Table generated by contrib/unicode/gen_std_format_width.py,")
print(" // from EastAsianWidth.txt from the Unicode standard.")
print(" inline constexpr char32_t __width_edges[] = {", end="")
for i, (c, _) in enumerate(edges):
    # Eight entries per output line.
    if i % 8:
        print(" ", end="")
    else:
        print("\n ", end="")
    print("{:#x},".format(c), end="")
print("\n };\n")

# By default every code point has Grapheme_Cluster_Break=Other.
all_code_points = ["Other"] * (1 + 0x10FFFF)

# Extract Grapheme_Cluster_Break property for all code points.
# The Unicode data files are UTF-8, so decode them as such instead of using
# the locale default, and close the file once it has been read.
with open("GraphemeBreakProperty.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # "0600..0605", "Prepend"
        # "00AD", "Control"
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+;', line):
            code_points, grapheme_property = line.split(";")
            process_code_points(code_points, grapheme_property.strip())

edges = find_edges(all_code_points)

# Number the properties in order of first appearance; "Other" is always 0.
gcb_props = {"Other":0}
for _, p in edges:
    if p not in gcb_props:
        gcb_props[p] = len(gcb_props)
# Number of low bits needed to store any property number.
shift_bits = int(math.ceil(math.log2(len(gcb_props))))

# Enum definition for std::__unicode::_Gcb_property

# NOTE(review): the spacing inside the string literals below was reconstructed
# from a whitespace-collapsed copy of this script; confirm it against the
# generated header before regenerating.
print(" enum class _Gcb_property {")
for name, number in gcb_props.items():
    print(" _Gcb_{} = {},".format(name, number))
print(" };\n")

# Tables for std::__unicode::_Grapheme_cluster_state

print(" // Values generated by contrib/unicode/gen_std_format_width.py,")
print(" // from GraphemeBreakProperty.txt from the Unicode standard.")
print(" // Entries are (code_point << shift_bits) + property.")
print(" inline constexpr int __gcb_shift_bits = {:#x};".format(shift_bits))
print(" inline constexpr uint32_t __gcb_edges[] = {", end="")
for i, (c, p) in enumerate(edges):
    # Six entries per output line.
    if i % 6:
        print(" ", end="")
    else:
        print("\n ", end="")
    # Pack the code point and its property number into one value.
    x = (c << shift_bits) + gcb_props[p]
    print("{0:#x},".format(x), end="")
print("\n };\n")

# By default every code point has Indic_Conjunct_Break=None.
all_code_points = [None] * (1 + 0x10FFFF)

# Extract Indic_Conjunct_Break property for all code points.
# The Unicode data files are UTF-8, so decode them as such instead of using
# the locale default, and close the file once it has been read.
with open("DerivedCoreProperties.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 094D ; InCB; Linker
        # 0B71 ; InCB; Consonant
        # 0300..034E ; InCB; Extend
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+; InCB;', line):
            code_points, _, incb_property = line.split(";")
            process_code_points(code_points, incb_property.strip())

# NOTE(review): the spacing inside the string literals below was reconstructed
# from a whitespace-collapsed copy of this script; confirm it against the
# generated header before regenerating.

# Table for std::__unicode::__is_incb_linker
# This table is tiny, so just contains the list of code points.
print(" inline constexpr char32_t __incb_linkers[] = {\n ", end="")
linkers = [i for i, p in enumerate(all_code_points) if p == "Linker"]
for i in linkers:
    print(" 0x{:04x},".format(i), end="")
    # Linkers get their own table above, so drop them from the edge table.
    all_code_points[i] = None
print("\n };\n")

edges = find_edges(all_code_points)

incb_props = {None:0, "Consonant":1, "Extend":2}
print(" enum class _InCB { _Consonant = 1, _Extend = 2 };\n")
# Table for std::__unicode::__incb_property
print(" // Values generated by contrib/unicode/gen_std_format_width.py,")
print(" // from DerivedCoreProperties.txt from the Unicode standard.")
print(" // Entries are (code_point << 2) + property.")
print(" inline constexpr uint32_t __incb_edges[] = {", end="")
for i, (c, p) in enumerate(edges):
    # Six entries per output line.
    if i % 6:
        print(" ", end="")
    else:
        print("\n ", end="")
    # Two low bits are enough for the three property numbers.
    x = (c << 2) + incb_props[p]
    print("{0:#x},".format(x), end="")
print("\n };\n")

# By default every code point has Emoji=No.
all_code_points = [False] * (1 + 0x10FFFF)

# Extract Emoji=Extended_Pictographic for all code points.
# The Unicode data files are UTF-8, so decode them as such instead of using
# the locale default, and close the file once it has been read.
with open("emoji-data.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 1100..115F ; Extended_Pictographic
        # 232A ; Extended_Pictographic
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+; Extended_Pictographic', line):
            process_code_points(line.split(";")[0], True)

edges = find_edges(all_code_points, False)

# NOTE(review): the spacing inside the string literals below was reconstructed
# from a whitespace-collapsed copy of this script; confirm it against the
# generated header before regenerating.

# Table for std::__unicode::__is_extended_pictographic
print(" // Table generated by contrib/unicode/gen_std_format_width.py,")
print(" // from emoji-data.txt from the Unicode standard.")
print(" inline constexpr char32_t __xpicto_edges[] = {", end="")
for i, (c, _) in enumerate(edges):
    # Eight entries per output line.
    if i % 8:
        print(" ", end="")
    else:
        print("\n ", end="")
    print("{:#x},".format(c), end="")
print("\n };\n")

# <bits/unicode.h> gives an error if this macro is left defined.
# Do this last, so that the generated output is not usable unless we reach here.
print("#undef _GLIBCXX_GET_UNICODE_DATA")