Home | History | Annotate | Line # | Download | only in unicode
      1 #!/usr/bin/env python3
      2 #
      3 # Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py.
      4 #
      5 # This file is part of GCC.
      6 #
      7 # GCC is free software; you can redistribute it and/or modify it under
      8 # the terms of the GNU General Public License as published by the Free
      9 # Software Foundation; either version 3, or (at your option) any later
     10 # version.
     11 #
     12 # GCC is distributed in the hope that it will be useful, but WITHOUT ANY
     13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or
     14 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     15 # for more details.
     16 #
     17 # You should have received a copy of the GNU General Public License
     18 # along with GCC; see the file COPYING3.  If not see
     19 # <http://www.gnu.org/licenses/>.
     20 
     21 import sys
     22 import os
     23 
# The script takes exactly one argument: the Unicode version to generate
# tables for.  Anything else prints a usage message and exits with status 1.
if len(sys.argv) == 2:
    unicode_version = sys.argv[1]
else:
    print("usage: %s <unicode version>" % sys.argv[0], file=sys.stderr)
    sys.exit(1)
     28 
# Convert a codepoint written the way the glibc tools print it, i.e. "<U"
# followed by hex digits and a closing ">", into its integer value.
# Raises ValueError if the wrapper is missing or the digits are not hex.
def parse_ucn(s):
    well_formed = s.startswith("<U") and s.endswith(">")
    if well_formed:
        # Drop the leading "<U" and the trailing ">" before converting.
        return int(s[2:-1], base=16)
    raise ValueError
     34 
# Global table of display widths, indexed by codepoint.  Every codepoint
# starts out at width 1; process_width() overwrites entries from the width
# lines produced by utf8_gen.py.
widths = [1] * (0x10FFFF + 1)
def process_width(line):
    # A width line is a single codepoint or an inclusive "first...last"
    # range, followed by whitespace and the width, e.g.:
    # <UA8FF>	0
    # <UA926>...<UA92D>	0
    #
    # Raises IndexError if the width field is missing and ValueError if any
    # field is malformed.
    fields = line.split()
    width = int(fields[1])
    endpoints = fields[0].split("...")
    count = len(endpoints)
    if count not in (1, 2):
        raise ValueError
    begin = parse_ucn(endpoints[0])
    # The range in the file is inclusive, so the slice end is one past it.
    last = begin if count == 1 else parse_ucn(endpoints[1])
    end = last + 1
    widths[begin:end] = [width] * (end - begin)
     54 
# To keep things simple, we use glibc utf8_gen.py as-is.  It only outputs to a
# file named UTF-8, which is not configurable.  Then we parse this into the form
# we want it.
os.system("from_glibc/utf8_gen.py --unicode_version %s" % unicode_version)

# Only the lines between the "WIDTH" and "END WIDTH" markers carry width data;
# everything outside that section is skipped.
processing = False
with open("UTF-8", "r") as utf8_file:
    for line in utf8_file:
        if processing:
            if line == "END WIDTH\n":
                processing = False
            else:
                try:
                    process_width(line)
                except (ValueError, IndexError) as e:
                    # A malformed line is reported and skipped rather than
                    # aborting the run.  The exception must be bound to a
                    # name here, otherwise printing it raises NameError.
                    # `line` still ends in its newline, hence end="".
                    print(e, "warning: ignored unexpected line: %s" % line,
                          file=sys.stderr, end="")
        elif line == "WIDTH\n":
            processing = True
     72 
# All bytes < 256 we treat as width 1.  The slice end is exclusive, so it has
# to be 256 for codepoint 255 to be covered as well.
widths[0:256] = [1] * 256

# Condense the list to contiguous ranges.  Each entry of all_ranges is
# [last codepoint of the run, width shared by the run].  The initial
# [-1, 1] run is an empty placeholder that the first codepoint extends,
# which works because codepoint 0 was just forced to width 1.
cur_range = [-1, 1]
all_ranges = []
for i, width in enumerate(widths):
    if width == cur_range[1]:
        cur_range[0] = i
    else:
        all_ranges.append(cur_range)
        cur_range = [i, width]
# The loop only stores a run when the next one starts, so the final run
# (the one ending at the last codepoint) has to be stored explicitly.
all_ranges.append(cur_range)
     85 
# Write the two parallel arrays for generated_cpp_wcwidth.h to stdout:
# wcwidth_range_ends holds the last codepoint of each run and wcwidth_widths
# holds the width of the same run.
print("/*  Generated by contrib/unicode/gen_wcwidth.py,",
      "with the help of glibc's")
print("    utf8_gen.py, using version %s" % unicode_version,
      "of the Unicode standard.  */")

# Range ends, eight entries per output line.
print("\nstatic const cppchar_t wcwidth_range_ends[] = {", end="")
for index, entry in enumerate(all_ranges):
    separator = " " if index % 8 else "\n  "
    print(separator, end="")
    print("0x%x," % (entry[0]), end="")
print("\n};\n")

# Widths, twenty-four entries per output line.
print("static const unsigned char wcwidth_widths[] = {", end="")
for index, entry in enumerate(all_ranges):
    separator = " " if index % 24 else "\n  "
    print(separator, end="")
    print("%d," % entry[1], end="")
print("\n};")
    107