fc-case.py revision a4e54154
1#!/usr/bin/env python3
2#
3# fontconfig/fc-case/fc-case.py
4#
5# Copyright © 2004 Keith Packard
6# Copyright © 2019 Tim-Philipp Müller
7#
8# Permission to use, copy, modify, distribute, and sell this software and its
9# documentation for any purpose is hereby granted without fee, provided that
10# the above copyright notice appear in all copies and that both that
11# copyright notice and this permission notice appear in supporting
12# documentation, and that the name of the author(s) not be used in
13# advertising or publicity pertaining to distribution of the software without
14# specific, written prior permission.  The authors make no
15# representations about the suitability of this software for any purpose.  It
16# is provided "as is" without express or implied warranty.
17#
18# THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
19# INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
20# EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR
21# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
22# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
23# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
24# PERFORMANCE OF THIS SOFTWARE.
25
26from enum import Enum
27import argparse
28import string
29import sys
30
class CaseFoldClass(Enum):
    """Status of a mapping line in the Unicode CaseFolding.txt file.

    Each member corresponds to one of the single-letter status codes that
    ``caseFoldClassMap`` translates ('C', 'F', 'S', 'T').  The table
    generator in this script only emits folds for COMMON and FULL entries;
    SIMPLE and TURKIC lines are parsed but otherwise ignored.
    """
    # 'C': mapping shared by simple and full case folding
    COMMON = 1
    # 'F': full mapping; may fold one code point to several
    FULL = 2
    # 'S': simple (single code point) alternative to an 'F' mapping
    SIMPLE = 3
    # 'T': special-case mapping for Turkic languages
    TURKIC = 4
36
class CaseFoldMethod(Enum):
    """How a generated fold record maps upper-case code points to lower case.

    The generated C table refers to these by name (see
    ``case_fold_method_name_map``), not by numeric value.
    NOTE(review): the numeric values presumably mirror the C-side enum
    order -- confirm against the fontconfig headers before relying on them.
    """
    # A run of consecutive code points that all fold by adding one offset
    RANGE = 0
    # A run where every second code point folds to the code point after it
    EVEN_ODD = 1
    # Folds to a UTF-8 byte sequence stored in the fold-chars table;
    # the record's offset/count index into that table
    FULL = 2
41
# Translate the one-letter status column of CaseFolding.txt into the
# corresponding CaseFoldClass member.
caseFoldClassMap = dict(
    C=CaseFoldClass.COMMON,
    F=CaseFoldClass.FULL,
    S=CaseFoldClass.SIMPLE,
    T=CaseFoldClass.TURKIC,
)

# Fold records accumulated while parsing, in input order.  Each record is a
# dict with the keys 'upper', 'method', 'offset' and 'count'.
folds = list()
50
def ucs4_to_utf8(ucs4):
    """Encode a single UCS-4 code point as a list of UTF-8 byte values.

    This follows the original UTF-8 scheme with sequences of up to six
    bytes, so every value in the range 0..0x7FFFFFFF is encodable (no
    surrogate or U+10FFFF limit is enforced).

    Args:
        ucs4: The code point as an integer.

    Returns:
        A list of ints in the range 0..255 holding the encoded bytes, or an
        empty list when ``ucs4`` is negative or does not fit in 31 bits.
    """
    # Negative values used to fall into the one-byte branch and produce a
    # negative "byte"; treat them like any other unencodable input.
    if ucs4 < 0:
        return []

    # Plain ASCII is emitted verbatim as a single byte.
    if ucs4 < 0x80:
        return [ucs4]

    # (exclusive upper bound, lead-byte marker, lead-byte payload mask,
    #  number of continuation bytes)
    layouts = (
        (0x800, 0xC0, 0x1F, 1),
        (0x10000, 0xE0, 0x0F, 2),
        (0x200000, 0xF0, 0x07, 3),
        (0x4000000, 0xF8, 0x03, 4),
        (0x80000000, 0xFC, 0x01, 5),
    )

    for limit, marker, mask, continuation_count in layouts:
        if ucs4 < limit:
            top_shift = 6 * continuation_count
            encoded = [((ucs4 >> top_shift) & mask) | marker]
            # Each continuation byte carries the next lower six bits.
            for shift in range(top_shift - 6, -1, -6):
                encoded.append(((ucs4 >> shift) & 0x3F) | 0x80)
            return encoded

    # Needs more than 31 bits: not representable.
    return []
80
def utf8_size(ucs4):
    """Return how many bytes the UTF-8 encoding of code point *ucs4* takes.

    The result is the length of the list produced by ``ucs4_to_utf8``, so
    it is 0 for any value that function cannot encode.
    """
    encoded = ucs4_to_utf8(ucs4)
    return len(encoded)
83
# C identifier (with its trailing comma, ready to be column-aligned in the
# generated table) for every fold method: FC_CASE_FOLD_<MEMBER NAME>,
case_fold_method_name_map = {
    method: 'FC_CASE_FOLD_{},'.format(method.name)
    for method in CaseFoldMethod
}
89
def _fail(message):
    """Print *message* to stderr and terminate the script with exit status 1."""
    print(message, file=sys.stderr)
    sys.exit(1)


def _to_int16(value):
    """Wrap *value* into the signed 16-bit range -32768..32767.

    The offset column of the generated table is stored in a 16-bit signed
    field, so larger deltas are reduced modulo 65536 (two's complement).
    """
    return ((value + 0x8000) & 0xFFFF) - 0x8000


def _read_case_folding(path):
    """Parse a Unicode CaseFolding.txt file and build the fold tables.

    Fold records are appended to the module-level ``folds`` list.  Entries
    are expected in ascending code point order, as in the published file:
    run detection and the min/max tracking both rely on it.

    Args:
        path: Path of the CaseFolding.txt file to read.

    Returns:
        A tuple ``(min_fold_char, max_fold_char, fold_chars,
        max_fold_chars, max_expand)`` where ``fold_chars`` is the list of
        UTF-8 bytes referenced by FULL folds, ``max_fold_chars`` is the
        longest such byte sequence and ``max_expand`` is the largest growth
        in UTF-8 length caused by any emitted fold.  ``min_fold_char`` and
        ``max_fold_char`` are ``None`` when the file has no data lines.
    """
    min_fold_char = None
    max_fold_char = None
    fold = None  # most recently created record; may still be extended
    fold_chars = []
    max_fold_chars = 0
    max_expand = 0

    with open(path, 'r', encoding='utf-8') as casefile:
        for lineno, line in enumerate(casefile, 1):
            # Data lines start with a hexadecimal code point; everything
            # else (comments, blank lines) is skipped.
            if not line or line[0] not in string.hexdigits:
                continue

            tokens = line.split('; ')
            if len(tokens) < 3:
                _fail('Not enough tokens in line {}'.format(lineno))

            # Upper case value, status class, list of result characters
            upper = int(tokens[0], 16)
            cfclass = caseFoldClassMap.get(tokens[1])
            if cfclass is None:
                _fail('Unknown case folding class {!r} in line {}'.format(
                    tokens[1], lineno))
            lower = [int(s, 16) for s in tokens[2].split()]

            # Compare against None explicitly: code point 0 is falsy.
            if min_fold_char is None:
                min_fold_char = upper
            max_fold_char = upper

            # Only common and full mappings end up in the table.
            if cfclass not in (CaseFoldClass.COMMON, CaseFoldClass.FULL):
                continue

            if len(lower) == 1:
                delta = lower[0] - upper

                # Does this entry continue the run described by `fold`?
                if fold is not None and fold['method'] == CaseFoldMethod.RANGE:
                    extends = (delta == fold['offset']
                               and upper == fold['upper'] + fold['count'])
                elif fold is not None and fold['method'] == CaseFoldMethod.EVEN_ODD:
                    extends = (delta == 1
                               and upper == fold['upper'] + fold['count'] + 1)
                else:
                    extends = False

                if extends:
                    # `fold` is the same object as the last item of `folds`
                    fold['count'] = upper - fold['upper'] + 1
                else:
                    fold = {
                        'upper': upper,
                        'offset': delta,
                        'method': (CaseFoldMethod.EVEN_ODD if delta == 1
                                   else CaseFoldMethod.RANGE),
                        'count': 1,
                    }
                    folds.append(fold)

                expand = utf8_size(lower[0]) - utf8_size(upper)
            else:
                # Multi-character result: store its UTF-8 bytes in the
                # shared byte table and reference them by offset/count.
                start = len(fold_chars)
                for c in lower:
                    fold_chars.extend(ucs4_to_utf8(c))

                fold = {
                    'upper': upper,
                    'method': CaseFoldMethod.FULL,
                    'offset': start,
                    'count': len(fold_chars) - start,
                }
                folds.append(fold)

                max_fold_chars = max(max_fold_chars, fold['count'])
                expand = fold['count'] - utf8_size(upper)

            # Track the worst-case growth for single- and multi-character
            # folds alike (previously only the latter were considered).
            max_expand = max(max_expand, expand)

    return min_fold_char, max_fold_char, fold_chars, max_fold_chars, max_expand


def _write_tables(out, min_fold_char, max_fold_char, fold_chars,
                  max_fold_chars, max_expand):
    """Write the size macros and the two C tables to the text stream *out*."""
    print('#define FC_NUM_CASE_FOLD\t{}'.format(len(folds)), file=out)
    print('#define FC_NUM_CASE_FOLD_CHARS\t{}'.format(len(fold_chars)), file=out)
    print('#define FC_MAX_CASE_FOLD_CHARS\t{}'.format(max_fold_chars), file=out)
    print('#define FC_MAX_CASE_FOLD_EXPAND\t{}'.format(max_expand), file=out)
    print('#define FC_MIN_FOLD_CHAR\t0x{:08x}'.format(min_fold_char), file=out)
    print('#define FC_MAX_FOLD_CHAR\t0x{:08x}'.format(max_fold_char), file=out)
    print('', file=out)

    # Dump out ranges
    print('static const FcCaseFold    fcCaseFold[FC_NUM_CASE_FOLD] = {', file=out)
    for f in folds:
        print('    {{ 0x{:08x}, {:22s} 0x{:04x}, {:6d} }},'.format(
            f['upper'], case_fold_method_name_map[f['method']],
            f['count'], _to_int16(f['offset'])), file=out)
    print('};\n', file=out)

    # Dump out "other" values: 16 bytes per row, no trailing comma
    print('static const FcChar8\tfcCaseFoldChars[FC_NUM_CASE_FOLD_CHARS] = {', file=out)
    rows = (fold_chars[i:i + 16] for i in range(0, len(fold_chars), 16))
    print(',\n'.join(','.join('0x{:02x}'.format(c) for c in row) for row in rows),
          end='', file=out)
    print('\n};', file=out)


def main():
    """Generate the case folding tables from the command-line arguments.

    Reads the CaseFolding.txt file named on the command line, copies the
    template (``--template`` or stdin) up to the line consisting of '@@@',
    emits the generated tables in its place, then copies the remainder of
    the template.  Output goes to ``--output`` or stdout.
    """
    import contextlib

    parser = argparse.ArgumentParser()
    parser.add_argument('case_folding_file')
    parser.add_argument('--template', dest='template_file', default=None)
    parser.add_argument('--output', dest='output_file', default=None)
    args = parser.parse_args()

    (min_fold_char, max_fold_char, fold_chars,
     max_fold_chars, max_expand) = _read_case_folding(args.case_folding_file)

    if min_fold_char is None:
        _fail('No case folding entries found in {}'.format(args.case_folding_file))

    # ExitStack closes whichever files we opened ourselves while leaving
    # the standard streams alone.
    with contextlib.ExitStack() as stack:
        if args.output_file:
            out = stack.enter_context(
                open(args.output_file, 'w', encoding='utf-8'))
        else:
            out = sys.stdout

        if args.template_file:
            tmpl_file = stack.enter_context(
                open(args.template_file, 'r', encoding='utf-8'))
        else:
            tmpl_file = sys.stdin

        # Scan the input until the marker is found
        # FIXME: this is a bit silly really, might just as well hardcode
        #        the license header in the script and drop the template
        for line in tmpl_file:
            if line.strip() == '@@@':
                break
            print(line, end='', file=out)

        _write_tables(out, min_fold_char, max_fold_char, fold_chars,
                      max_fold_chars, max_expand)

        # And flush out the rest of the input file
        for line in tmpl_file:
            print(line, end='', file=out)

        out.flush()


if __name__ == '__main__':
    main()
241