1#!/usr/bin/env python3
2#
3# fontconfig/fc-lang/fc-lang.py
4#
5# Copyright © 2001-2002 Keith Packard
6# Copyright © 2019 Tim-Philipp Müller
7#
8# Permission to use, copy, modify, distribute, and sell this software and its
9# documentation for any purpose is hereby granted without fee, provided that
10# the above copyright notice appear in all copies and that both that
11# copyright notice and this permission notice appear in supporting
12# documentation, and that the name of the author(s) not be used in
13# advertising or publicity pertaining to distribution of the software without
14# specific, written prior permission.  The authors make no
15# representations about the suitability of this software for any purpose.  It
16# is provided "as is" without express or implied warranty.
17#
18# THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
19# INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
20# EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR
21# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
22# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
23# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
24# PERFORMANCE OF THIS SOFTWARE.
25
26# fc-lang
27#
28# Read a set of language orthographies and build C declarations for
29# charsets which can then be used to identify which languages are
30# supported by a given font.
31#
32# TODO: this code is not very pythonic, a lot of it is a 1:1 translation
33# of the C code and we could probably simplify it a bit
34import argparse
35import string
36import sys
37import os
38
39# we just store the leaves in a dict, we can order the leaves later if needed
# we just store the leaves in a dict, we can order the leaves later if needed
class CharSet:
    """Sparse Unicode coverage map, stored as 256-codepoint pages ("leaves").

    Each leaf is a list of 8 uint32 words (256 bits) keyed by page number
    (codepoint >> 8), mirroring fontconfig's FcCharLeaf layout.
    """

    def __init__(self):
        # page number -> list of 8 uint32 bitmask words
        self.leaves = {}

    def add_char(self, ucs4):
        """Set the coverage bit for codepoint *ucs4*."""
        assert ucs4 < 0x01000000
        page = self.leaves.setdefault(ucs4 >> 8, [0] * 8)  # 256/32 = 8 words
        page[(ucs4 & 0xff) >> 5] |= 1 << (ucs4 & 0x1f)

    def del_char(self, ucs4):
        """Clear the coverage bit for codepoint *ucs4*.

        A leaf that becomes empty is deliberately kept, matching the
        original behaviour.
        """
        assert ucs4 < 0x01000000
        page = self.leaves.get(ucs4 >> 8)
        if page is not None:
            page[(ucs4 & 0xff) >> 5] &= ~(1 << (ucs4 & 0x1f))

    def equals(self, other_cs):
        """Return True when both charsets hold identical leaves under identical keys."""
        if sorted(self.leaves.keys()) != sorted(other_cs.leaves.keys()):
            return False
        # All leaves created here are exactly 8 words long, so plain list
        # equality is equivalent to the word-by-word comparison.
        return all(self.leaves[k] == other_cs.leaves[k] for k in self.leaves)
75
76# Convert a file name into a name suitable for C declarations
def get_name(file_name):
    """Turn a file name into a C-declaration-friendly name.

    Everything from the first '.' onwards is dropped ('aa.orth' -> 'aa').
    """
    stem, _, _ = file_name.partition('.')
    return stem
79
80# Convert a C name into a language name
def get_lang(c_name):
    """Turn a C name into a language tag: '_' -> '-', spaces removed, lowercased."""
    return c_name.translate(str.maketrans({'_': '-', ' ': None})).lower()
83
def read_orth_file(file_name):
    """Read an orthography file, expanding 'include' directives recursively.

    Returns a list of (file_name, line_number, text) tuples where comments
    ('#' to end of line), tab-separated trailing fields and blank lines have
    been stripped. Line numbers are 0-based; included files keep their own
    file name and numbering. Include paths are resolved relative to the
    current working directory.
    """
    INCLUDE = 'include '
    entries = []
    with open(file_name, 'r', encoding='utf-8') as orth_file:
        for line_no, raw in enumerate(orth_file):
            if raw.startswith(INCLUDE):
                entries.extend(read_orth_file(raw[len(INCLUDE):].strip()))
                continue
            # Drop the comment, then any tab-separated trailing field.
            text = raw.split('#')[0].strip().split('\t')[0].strip()
            if text:
                entries.append((file_name, line_no, text))
    return entries
100
def leaves_equal(leaf1, leaf2):
    """Compare two leaves word by word (up to the shorter one's length)."""
    return all(w1 == w2 for w1, w2 in zip(leaf1, leaf2))
106
107# Build a single charset from a source file
108#
109# The file format is quite simple, either
110# a single hex value or a pair separated with a dash
def parse_orth_file(file_name, lines):
    """Build a CharSet from parsed orthography lines.

    *lines* is a list of (file_name, line_number, text) tuples as produced by
    read_orth_file(). Each text entry is a single hex codepoint or a
    'start-end' / 'start..end' range; a leading '-' on the whole entry means
    the codepoint(s) are removed from the set instead of added.
    """
    charset = CharSet()
    for fn, num, line in lines:
        # A leading '-' marks the entry for removal rather than addition.
        delete_char = line.startswith('-')
        if delete_char:
            line = line[1:]
        if '-' in line:
            parts = line.split('-')
        elif '..' in line:
            parts = line.split('..')
        else:
            parts = [line]

        start = int(parts.pop(0), 16)
        end = start
        if parts:
            end = int(parts.pop(0), 16)
        if parts:
            # Report to stderr: sys.stdout may have been redirected to the
            # generated output file, which must not contain error text.
            print('ERROR: {} line {}: parse error (too many parts)'.format(fn, num),
                  file=sys.stderr)

        for ucs4 in range(start, end + 1):
            if delete_char:
                charset.del_char(ucs4)
            else:
                charset.add_char(ucs4)

    assert charset.equals(charset)  # sanity check for the equals function

    return charset
140
if __name__=='__main__':
    # Command line: fc-lang.py [--directory DIR] [--template FILE]
    #               [--output FILE] file1.orth [file2.orth ...]
    parser = argparse.ArgumentParser()
    parser.add_argument('orth_files', nargs='+', help='List of .orth files')
    parser.add_argument('--directory', dest='directory', default=None)
    parser.add_argument('--template', dest='template_file', default=None)
    parser.add_argument('--output', dest='output_file', default=None)

    args = parser.parse_args()

    # Parallel lists, all indexed by the *sorted* orth-file order:
    sets = []     # CharSet per orthography
    names = []    # C-friendly name per orthography (file name stem)
    langs = []    # language tag per orthography (e.g. 'zh-cn')
    country = []  # original command-line indices of lang-COUNTRY entries

    total_leaves = 0

    # language family (e.g. 'zh') -> list of original command-line indices
    # of its country-specific variants
    LangCountrySets = {}

    # Open output file. NOTE: sys.stdout is rebound for the rest of the run,
    # so every print() below writes into the generated file.
    if args.output_file:
        sys.stdout = open(args.output_file, 'w', encoding='utf-8')

    # Read the template file
    if args.template_file:
        tmpl_file = open(args.template_file, 'r', encoding='utf-8')
    else:
        tmpl_file = sys.stdin

    # Change into source dir if specified (after opening other files)
    if args.directory:
        os.chdir(args.directory)

    # Remember each file's position on the command line; the generated
    # langIndices/langIndicesInv tables map between that order and the
    # sorted order used below.
    orth_entries = {}
    for i, fn in enumerate(args.orth_files):
        orth_entries[fn] = i

    # Process files in sorted order so the generated tables are stable
    # regardless of command-line ordering.
    for fn in sorted(orth_entries.keys()):
        lines = read_orth_file(fn)
        charset = parse_orth_file(fn, lines)

        sets.append(charset)

        name = get_name(fn)
        names.append(name)

        lang = get_lang(name)
        langs.append(lang)
        # A '-' in the tag marks a country-specific entry like 'zh-cn'.
        if lang.find('-') != -1:
            country.append(orth_entries[fn]) # maps to original index
            language_family = lang.split('-')[0]
            if not language_family in LangCountrySets:
              LangCountrySets[language_family] = []
            LangCountrySets[language_family] += [orth_entries[fn]]

        total_leaves += len(charset.leaves)
196
197    # Find unique leaves
198    leaves = []
199    for s in sets:
200       for leaf_num in sorted(s.leaves.keys()):
201           leaf = s.leaves[leaf_num]
202           is_unique = True
203           for existing_leaf in leaves:
204               if leaves_equal(leaf, existing_leaf):
205                  is_unique = False
206                  break
207           #print('unique: ', is_unique)
208           if is_unique:
209               leaves.append(leaf)
210
211    # Find duplicate charsets
212    duplicate = []
213    for i, s in enumerate(sets):
214        dup_num = None
215        if i >= 1:
216            for j, s_cmp in enumerate(sets):
217                if j >= i:
218                    break
219                if s_cmp.equals(s):
220                    dup_num = j
221                    break
222
223        duplicate.append(dup_num)
224
225    tn = 0
226    off = {}
227    for i, s in enumerate(sets):
228        if duplicate[i]:
229            continue
230        off[i] = tn
231        tn += len(s.leaves)
232
233    # Scan the input until the marker is found
234    # FIXME: this is a bit silly really, might just as well hardcode
235    #        the license header in the script and drop the template
236    for line in tmpl_file:
237        if line.strip() == '@@@':
238            break
239        print(line, end='')
240
241    print('/* total size: {} unique leaves: {} */\n'.format(total_leaves, len(leaves)))
242
243    print('#define LEAF0       ({} * sizeof (FcLangCharSet))'.format(len(sets)))
244    print('#define OFF0        (LEAF0 + {} * sizeof (FcCharLeaf))'.format(len(leaves)))
245    print('#define NUM0        (OFF0 + {} * sizeof (uintptr_t))'.format(tn))
246    print('#define SET(n)      (n * sizeof (FcLangCharSet) + offsetof (FcLangCharSet, charset))')
247    print('#define OFF(s,o)    (OFF0 + o * sizeof (uintptr_t) - SET(s))')
248    print('#define NUM(s,n)    (NUM0 + n * sizeof (FcChar16) - SET(s))')
249    print('#define LEAF(o,l)   (LEAF0 + l * sizeof (FcCharLeaf) - (OFF0 + o * sizeof (intptr_t)))')
250    print('#define fcLangCharSets (fcLangData.langCharSets)')
251    print('#define fcLangCharSetIndices (fcLangData.langIndices)')
252    print('#define fcLangCharSetIndicesInv (fcLangData.langIndicesInv)')
253
254    assert len(sets) < 65536 # FIXME: need to change index type to 32-bit below then
255
256    print('''
257static const struct {{
258    FcLangCharSet  langCharSets[{}];
259    FcCharLeaf     leaves[{}];
260    uintptr_t      leaf_offsets[{}];
261    FcChar16       numbers[{}];
262    {}       langIndices[{}];
263    {}       langIndicesInv[{}];
264}} fcLangData = {{'''.format(len(sets), len(leaves), tn, tn,
265                             'FcChar16 ', len(sets), 'FcChar16 ', len(sets)))
266
267    # Dump sets
268    print('{')
269    for i, s in enumerate(sets):
270        if duplicate[i]:
271            j = duplicate[i]
272        else:
273            j = i
274        print('    {{ "{}",  {{ FC_REF_CONSTANT, {}, OFF({},{}), NUM({},{}) }} }}, /* {} */'.format(
275		langs[i], len(sets[j].leaves), i, off[j], i, off[j], i))
276
277    print('},')
278
279    # Dump leaves
280    print('{')
281    for l, leaf in enumerate(leaves):
282        print('    {{ {{ /* {} */'.format(l), end='')
283        for i in range(0, 8): # 256/32 = 8
284            if i % 4 == 0:
285                print('\n   ', end='')
286            print(' 0x{:08x},'.format(leaf[i]), end='')
287        print('\n    } },')
288    print('},')
289
290    # Dump leaves
291    print('{')
292    for i, s in enumerate(sets):
293        if duplicate[i]:
294            continue
295
296        print('    /* {} */'.format(names[i]))
297
298        for n, leaf_num in enumerate(sorted(s.leaves.keys())):
299            leaf = s.leaves[leaf_num]
300            if n % 4 == 0:
301                print('   ', end='')
302            found = [k for k, unique_leaf in enumerate(leaves) if leaves_equal(unique_leaf,leaf)]
303            assert found, "Couldn't find leaf in unique leaves list!"
304            assert len(found) == 1
305            print(' LEAF({:3},{:3}),'.format(off[i], found[0]), end='')
306            if n % 4 == 3:
307                print('')
308        if len(s.leaves) % 4 != 0:
309            print('')
310
311    print('},')
312
313    print('{')
314    for i, s in enumerate(sets):
315        if duplicate[i]:
316            continue
317
318        print('    /* {} */'.format(names[i]))
319
320        for n, leaf_num in enumerate(sorted(s.leaves.keys())):
321            leaf = s.leaves[leaf_num]
322            if n % 8 == 0:
323                print('   ', end='')
324            print(' 0x{:04x},'.format(leaf_num), end='')
325            if n % 8 == 7:
326                print('')
327        if len(s.leaves) % 8 != 0:
328            print('')
329
330    print('},')
331
    # langIndices: maps sorted-order set index -> original command-line index
    print('{')
    for i, s in enumerate(sets):
        # NOTE(review): assumes every input file is named '<name>.orth';
        # a different extension would make this dict lookup raise — confirm.
        fn = '{}.orth'.format(names[i])
        print('    {}, /* {} */'.format(orth_entries[fn], names[i]))
    print('},')

    # langIndicesInv: maps original command-line index -> sorted-order index
    # (orth_entries preserves command-line insertion order; idx is the
    # position of the name in the sorted `names` list)
    print('{')
    for i, k in enumerate(orth_entries.keys()):
        name = get_name(k)
        idx = names.index(name)
        print('    {}, /* {} */'.format(idx, name))
    print('}')

    print('};\n')

    print('#define NUM_LANG_CHAR_SET	{}'.format(len(sets)))
    # Number of 32-bit words needed for a bitmap with one bit per language.
    num_lang_set_map = (len(sets) + 31) // 32;
    print('#define NUM_LANG_SET_MAP	{}'.format(num_lang_set_map))

    # Dump indices with country codes: one bitmap per language family,
    # with a bit set for each of its country-specific variants.
    assert len(country) > 0
    assert len(LangCountrySets) > 0
    print('')
    print('static const FcChar32 fcLangCountrySets[][NUM_LANG_SET_MAP] = {')
    for k in sorted(LangCountrySets.keys()):
        langset_map = [0] * num_lang_set_map # initialise all zeros
        for entries_id in LangCountrySets[k]:
            langset_map[entries_id >> 5] |= (1 << (entries_id & 0x1f))
        print('    {', end='')
        for v in langset_map:
            print(' 0x{:08x},'.format(v), end='')
        print(' }}, /* {} */'.format(k))

    print('};\n')
    print('#define NUM_COUNTRY_SET {}\n'.format(len(LangCountrySets)))

    # Find ranges for each letter for faster searching
    # Dump sets start/finish for the fastpath
    # (sets are in sorted name order, so all names starting with a given
    # letter occupy one contiguous index range; 9999/-1 are the sentinels
    # emitted for letters with no languages at all)
    print('static const FcLangCharSetRange  fcLangCharSetRanges[] = {\n')
    for c in string.ascii_lowercase: # a-z
        start = 9999
        stop = -1
        for i, s in enumerate(sets):
            if names[i].startswith(c):
                start = min(start,i)
                stop = max(stop,i)
        print('    {{ {}, {} }}, /* {} */'.format(start, stop, c))
    print('};\n')

    # And flush out the rest of the input file
    for line in tmpl_file:
        print(line, end='')

    sys.stdout.flush()
388