1a4e54154Smrg#!/usr/bin/env python3
2a4e54154Smrg#
3a4e54154Smrg# fontconfig/fc-lang/fc-lang.py
4a4e54154Smrg#
5a4e54154Smrg# Copyright © 2001-2002 Keith Packard
6a4e54154Smrg# Copyright © 2019 Tim-Philipp Müller
7a4e54154Smrg#
8a4e54154Smrg# Permission to use, copy, modify, distribute, and sell this software and its
9a4e54154Smrg# documentation for any purpose is hereby granted without fee, provided that
10a4e54154Smrg# the above copyright notice appear in all copies and that both that
11a4e54154Smrg# copyright notice and this permission notice appear in supporting
12a4e54154Smrg# documentation, and that the name of the author(s) not be used in
13a4e54154Smrg# advertising or publicity pertaining to distribution of the software without
14a4e54154Smrg# specific, written prior permission.  The authors make no
15a4e54154Smrg# representations about the suitability of this software for any purpose.  It
16a4e54154Smrg# is provided "as is" without express or implied warranty.
17a4e54154Smrg#
18a4e54154Smrg# THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
19a4e54154Smrg# INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
20a4e54154Smrg# EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR
21a4e54154Smrg# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
22a4e54154Smrg# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
23a4e54154Smrg# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
24a4e54154Smrg# PERFORMANCE OF THIS SOFTWARE.
25a4e54154Smrg
26a4e54154Smrg# fc-lang
27a4e54154Smrg#
28a4e54154Smrg# Read a set of language orthographies and build C declarations for
29a4e54154Smrg# charsets which can then be used to identify which languages are
30a4e54154Smrg# supported by a given font.
31a4e54154Smrg#
32a4e54154Smrg# TODO: this code is not very pythonic, a lot of it is a 1:1 translation
33a4e54154Smrg# of the C code and we could probably simplify it a bit
34a4e54154Smrgimport argparse
35a4e54154Smrgimport string
36a4e54154Smrgimport sys
37a4e54154Smrgimport os
38a4e54154Smrg
39a4e54154Smrg# we just store the leaves in a dict, we can order the leaves later if needed
class CharSet:
    """Sparse set of code points, stored as 256-bit leaves.

    ``leaves`` maps a leaf number (``ucs4 >> 8``) to a list of eight
    integers used as 32-bit words; bit ``ucs4 & 0x1f`` of word
    ``(ucs4 & 0xff) >> 5`` represents the code point ``ucs4``.
    """

    def __init__(self):
        self.leaves = {}  # leaf_number -> leaf data (8 words of 32 bits)

    def add_char(self, ucs4):
        """Mark code point *ucs4* as present, creating its leaf on demand."""
        assert ucs4 < 0x01000000
        words = self.leaves.setdefault(ucs4 >> 8, [0] * 8)  # 256/32 = 8
        low_byte = ucs4 & 0xff
        words[low_byte >> 5] |= 1 << (low_byte & 0x1f)

    def del_char(self, ucs4):
        """Clear code point *ucs4*; a no-op when its leaf does not exist."""
        assert ucs4 < 0x01000000
        words = self.leaves.get(ucs4 >> 8)
        if words is None:
            return
        low_byte = ucs4 & 0xff
        # The leaf is kept even when this clears its last bit.
        words[low_byte >> 5] &= ~(1 << (low_byte & 0x1f))

    def equals(self, other_cs):
        """Return True when both sets have the same leaf numbers and every
        pair of corresponding leaves compares equal via leaves_equal()."""
        if sorted(self.leaves) != sorted(other_cs.leaves):
            return False
        for leaf_num, words in self.leaves.items():
            if not leaves_equal(words, other_cs.leaves[leaf_num]):
                return False
        return True
75a4e54154Smrg
76a4e54154Smrg# Convert a file name into a name suitable for C declarations
def get_name(file_name):
    """Return the part of *file_name* before its first '.' (the whole
    string when it contains no dot)."""
    stem, _dot, _rest = file_name.partition('.')
    return stem
79a4e54154Smrg
80a4e54154Smrg# Convert a C name into a language name
def get_lang(c_name):
    """Turn a C-style name into a language tag: underscores become
    dashes, spaces are dropped and the result is lower-cased."""
    table = str.maketrans('_', '-', ' ')
    return c_name.translate(table).lower()
83a4e54154Smrg
def read_orth_file(file_name):
    """Read an orthography file and return its meaningful lines.

    Lines starting with ``include `` are replaced by the lines of the
    named file (read recursively; the path is used as given, so it is
    resolved against the current working directory).  For every other
    line, everything from the first ``#`` and from the first tab onwards
    is discarded, and lines that end up empty are skipped.

    Returns a list of ``(file_name, line_number, text)`` tuples, where
    ``line_number`` is the 1-based line number inside ``file_name`` so
    that diagnostics match what an editor shows.
    """
    lines = []
    with open(file_name, 'r', encoding='utf-8') as orth_file:
        # Number lines from 1: these numbers end up in error messages.
        for num, line in enumerate(orth_file, start=1):
            if line.startswith('include '):
                include_fn = line[len('include '):].strip()
                lines.extend(read_orth_file(include_fn))
                continue

            # remove comments and strip whitespace
            text = line.split('#')[0].strip()
            text = text.split('\t')[0].strip()
            # skip empty lines
            if text:
                lines.append((file_name, num, text))

    return lines
100a4e54154Smrg
def leaves_equal(leaf1, leaf2):
    """Return True when the two leaves hold the same words, compared
    position by position (over the length of the shorter one)."""
    return all(word1 == word2 for word1, word2 in zip(leaf1, leaf2))
106a4e54154Smrg
107a4e54154Smrg# Build a single charset from a source file
108a4e54154Smrg#
# The file format is quite simple: each line is either a single hex
# value or a range of two hex values separated by a dash or by '..';
# a leading dash removes the value or range instead of adding it
def parse_orth_file(file_name, lines):
    """Build a CharSet from the pre-read lines of an orthography file.

    *lines* is a list of ``(file_name, line_number, text)`` tuples as
    produced by read_orth_file().  Each ``text`` is a single hex code
    point or a range written ``start-end`` or ``start..end``; a leading
    ``-`` removes the code point or range from the set instead of adding
    it.  *file_name* is accepted for interface compatibility only; the
    file name used in diagnostics comes from each tuple.

    Exits the program with an error message on stderr when a line holds
    more than two values.  A value that is not valid hex raises
    ValueError from int().
    """
    charset = CharSet()
    for fn, num, line in lines:
        delete_char = line.startswith('-')
        if delete_char:
            line = line[1:]

        if '-' in line:
            parts = line.split('-')
        elif '..' in line:
            parts = line.split('..')
        else:
            parts = [line]

        if len(parts) > 2:
            # The script may have redirected sys.stdout to the generated
            # file, so the error must not be print()ed there; abort instead
            # of building a charset from a line that could not be parsed.
            sys.exit('ERROR: {} line {}: parse error (too many parts)'.format(fn, num))

        start = int(parts[0], 16)
        end = int(parts[1], 16) if len(parts) == 2 else start

        update = charset.del_char if delete_char else charset.add_char
        for ucs4 in range(start, end + 1):
            update(ucs4)

    assert charset.equals(charset) # sanity check for the equals function

    return charset
140a4e54154Smrg
if __name__ == '__main__':
    # Command line: one or more .orth files, plus an optional source
    # directory, template file (default: stdin) and output file
    # (default: stdout).
    parser = argparse.ArgumentParser()
    parser.add_argument('orth_files', nargs='+', help='List of .orth files')
    parser.add_argument('--directory', dest='directory', default=None)
    parser.add_argument('--template', dest='template_file', default=None)
    parser.add_argument('--output', dest='output_file', default=None)

    args = parser.parse_args()

    sets = []     # one CharSet per .orth file, in sorted file-name order
    names = []    # get_name() of each file, parallel to sets
    langs = []    # get_lang() of each name, parallel to sets
    country = []  # command-line indices of languages whose tag contains '-'

    total_leaves = 0

    # language family (tag part before the first '-') -> command-line
    # indices of the files belonging to it
    LangCountrySets = {}

    # Open output file
    if args.output_file:
        sys.stdout = open(args.output_file, 'w', encoding='utf-8')

    # Read the template file
    if args.template_file:
        tmpl_file = open(args.template_file, 'r', encoding='utf-8')
    else:
        tmpl_file = sys.stdin

    # Change into source dir if specified (after opening other files)
    if args.directory:
        os.chdir(args.directory)

    # file name -> position on the command line
    orth_entries = {}
    for i, fn in enumerate(args.orth_files):
        orth_entries[fn] = i

    for fn in sorted(orth_entries.keys()):
        lines = read_orth_file(fn)
        charset = parse_orth_file(fn, lines)

        sets.append(charset)

        name = get_name(fn)
        names.append(name)

        lang = get_lang(name)
        langs.append(lang)
        if '-' in lang:
            country.append(orth_entries[fn]) # maps to original index
            language_family = lang.split('-')[0]
            LangCountrySets.setdefault(language_family, []).append(orth_entries[fn])

        total_leaves += len(charset.leaves)

    # Find unique leaves (kept in order of first appearance)
    leaves = []
    for s in sets:
        for leaf_num in sorted(s.leaves.keys()):
            leaf = s.leaves[leaf_num]
            if not any(leaves_equal(leaf, existing_leaf) for existing_leaf in leaves):
                leaves.append(leaf)

    # Find duplicate charsets: duplicate[i] is the index of the first
    # earlier set equal to sets[i], or None when there is no such set.
    duplicate = []
    for i, s in enumerate(sets):
        dup_num = None
        for j in range(i):
            if sets[j].equals(s):
                dup_num = j
                break

        duplicate.append(dup_num)

    # Assign each non-duplicate set its starting offset in the leaf_offsets
    # and numbers tables.  The test must be "is not None": index 0 is a
    # valid duplicate target but is falsy.
    tn = 0
    off = {}
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue
        off[i] = tn
        tn += len(s.leaves)

    # Scan the input until the marker is found
    # FIXME: this is a bit silly really, might just as well hardcode
    #        the license header in the script and drop the template
    for line in tmpl_file:
        if line.strip() == '@@@':
            break
        print(line, end='')

    print('/* total size: {} unique leaves: {} */\n'.format(total_leaves, len(leaves)))

    print('#define LEAF0       ({} * sizeof (FcLangCharSet))'.format(len(sets)))
    print('#define OFF0        (LEAF0 + {} * sizeof (FcCharLeaf))'.format(len(leaves)))
    print('#define NUM0        (OFF0 + {} * sizeof (uintptr_t))'.format(tn))
    print('#define SET(n)      (n * sizeof (FcLangCharSet) + offsetof (FcLangCharSet, charset))')
    print('#define OFF(s,o)    (OFF0 + o * sizeof (uintptr_t) - SET(s))')
    print('#define NUM(s,n)    (NUM0 + n * sizeof (FcChar16) - SET(s))')
    print('#define LEAF(o,l)   (LEAF0 + l * sizeof (FcCharLeaf) - (OFF0 + o * sizeof (intptr_t)))')
    print('#define fcLangCharSets (fcLangData.langCharSets)')
    print('#define fcLangCharSetIndices (fcLangData.langIndices)')
    print('#define fcLangCharSetIndicesInv (fcLangData.langIndicesInv)')

    assert len(sets) < 65536 # FIXME: need to change index type to 32-bit below then

    print('''
static const struct {{
    FcLangCharSet  langCharSets[{}];
    FcCharLeaf     leaves[{}];
    uintptr_t      leaf_offsets[{}];
    FcChar16       numbers[{}];
    {}       langIndices[{}];
    {}       langIndicesInv[{}];
}} fcLangData = {{'''.format(len(sets), len(leaves), tn, tn,
                             'FcChar16 ', len(sets), 'FcChar16 ', len(sets)))

    # Dump sets; a duplicate set points at the tables of the set it duplicates
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            j = duplicate[i]
        else:
            j = i
        print('    {{ "{}",  {{ FC_REF_CONSTANT, {}, OFF({},{}), NUM({},{}) }} }}, /* {} */'.format(
            langs[i], len(sets[j].leaves), i, off[j], i, off[j], i))

    print('},')

    # Dump leaves
    print('{')
    for l, leaf in enumerate(leaves):
        print('    {{ {{ /* {} */'.format(l), end='')
        for i in range(0, 8): # 256/32 = 8
            if i % 4 == 0:
                print('\n   ', end='')
            print(' 0x{:08x},'.format(leaf[i]), end='')
        print('\n    } },')
    print('},')

    # Dump leaf offsets (one LEAF() entry per leaf of each non-duplicate set)
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue

        print('    /* {} */'.format(names[i]))

        for n, leaf_num in enumerate(sorted(s.leaves.keys())):
            leaf = s.leaves[leaf_num]
            if n % 4 == 0:
                print('   ', end='')
            found = [k for k, unique_leaf in enumerate(leaves) if leaves_equal(unique_leaf, leaf)]
            assert found, "Couldn't find leaf in unique leaves list!"
            assert len(found) == 1
            print(' LEAF({:3},{:3}),'.format(off[i], found[0]), end='')
            if n % 4 == 3:
                print('')
        if len(s.leaves) % 4 != 0:
            print('')

    print('},')

    # Dump leaf numbers (same order as the leaf offsets above)
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue

        print('    /* {} */'.format(names[i]))

        for n, leaf_num in enumerate(sorted(s.leaves.keys())):
            if n % 8 == 0:
                print('   ', end='')
            print(' 0x{:04x},'.format(leaf_num), end='')
            if n % 8 == 7:
                print('')
        if len(s.leaves) % 8 != 0:
            print('')

    print('},')

    # langIndices: sorted position -> command-line position
    print('{')
    for i, s in enumerate(sets):
        fn = '{}.orth'.format(names[i])
        print('    {}, /* {} */'.format(orth_entries[fn], names[i]))
    print('},')

    # langIndicesInv: command-line position -> sorted position
    print('{')
    for k in orth_entries.keys():
        name = get_name(k)
        idx = names.index(name)
        print('    {}, /* {} */'.format(idx, name))
    print('}')

    print('};\n')

    print('#define NUM_LANG_CHAR_SET\t{}'.format(len(sets)))
    num_lang_set_map = (len(sets) + 31) // 32
    print('#define NUM_LANG_SET_MAP\t{}'.format(num_lang_set_map))

    # Dump indices with country codes
    assert len(country) > 0
    assert len(LangCountrySets) > 0
    print('')
    print('static const FcChar32 fcLangCountrySets[][NUM_LANG_SET_MAP] = {')
    for k in sorted(LangCountrySets.keys()):
        langset_map = [0] * num_lang_set_map # initialise all zeros
        for entries_id in LangCountrySets[k]:
            langset_map[entries_id >> 5] |= (1 << (entries_id & 0x1f))
        print('    {', end='')
        for v in langset_map:
            print(' 0x{:08x},'.format(v), end='')
        print(' }}, /* {} */'.format(k))

    print('};\n')
    print('#define NUM_COUNTRY_SET {}\n'.format(len(LangCountrySets)))

    # Find ranges for each letter for faster searching
    # Dump sets start/finish for the fastpath
    print('static const FcLangCharSetRange  fcLangCharSetRanges[] = {\n')
    for c in string.ascii_lowercase: # a-z
        # (9999, -1) is emitted when no name starts with this letter
        start = 9999
        stop = -1
        for i, s in enumerate(sets):
            if names[i].startswith(c):
                start = min(start, i)
                stop = max(stop, i)
        print('    {{ {}, {} }}, /* {} */'.format(start, stop, c))
    print('};\n')

    # And flush out the rest of the input file
    for line in tmpl_file:
        print(line, end='')

    sys.stdout.flush()
388