dist/fc-lang/fc-lang.py

a4e54154Smrg#!/usr/bin/env python3
a4e54154Smrg#
a4e54154Smrg# fontconfig/fc-lang/fc-lang.py
a4e54154Smrg#
a4e54154Smrg# Copyright © 2001-2002 Keith Packard
a4e54154Smrg# Copyright © 2019 Tim-Philipp Müller
a4e54154Smrg#
a4e54154Smrg# Permission to use, copy, modify, distribute, and sell this software and its
a4e54154Smrg# documentation for any purpose is hereby granted without fee, provided that
a4e54154Smrg# the above copyright notice appear in all copies and that both that
a4e54154Smrg# copyright notice and this permission notice appear in supporting
a4e54154Smrg# documentation, and that the name of the author(s) not be used in
a4e54154Smrg# advertising or publicity pertaining to distribution of the software without
a4e54154Smrg# specific, written prior permission.  The authors make no
a4e54154Smrg# representations about the suitability of this software for any purpose.  It
a4e54154Smrg# is provided "as is" without express or implied warranty.
a4e54154Smrg#
a4e54154Smrg# THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
a4e54154Smrg# INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
a4e54154Smrg# EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR
a4e54154Smrg# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
a4e54154Smrg# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
a4e54154Smrg# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
a4e54154Smrg# PERFORMANCE OF THIS SOFTWARE.
a4e54154Smrg
a4e54154Smrg# fc-lang
a4e54154Smrg#
a4e54154Smrg# Read a set of language orthographies and build C declarations for
a4e54154Smrg# charsets which can then be used to identify which languages are
a4e54154Smrg# supported by a given font.
a4e54154Smrg#
a4e54154Smrg# TODO: this code is not very pythonic, a lot of it is a 1:1 translation
a4e54154Smrg# of the C code and we could probably simplify it a bit
a4e54154Smrgimport argparse
a4e54154Smrgimport string
a4e54154Smrgimport sys
a4e54154Smrgimport os
a4e54154Smrg
# Leaves are kept in a plain dict; callers sort the keys whenever they need
# the leaves in order.
class CharSet:
    """Set of code points stored as 256-bit leaves keyed by ``ucs4 >> 8``."""

    def __init__(self):
        # leaf number -> list of 8 integers used as 32-bit words (256/32 = 8)
        self.leaves = {}

    def add_char(self, ucs4):
        """Set the bit for code point ``ucs4``, creating its leaf if needed."""
        assert ucs4 < 0x01000000
        words = self.leaves.setdefault(ucs4 >> 8, [0] * 8)
        low_byte = ucs4 & 0xff
        words[low_byte >> 5] |= 1 << (ucs4 & 0x1f)

    def del_char(self, ucs4):
        """Clear the bit for code point ``ucs4`` if its leaf exists.

        A leaf that becomes empty is left in place rather than removed.
        """
        assert ucs4 < 0x01000000
        try:
            words = self.leaves[ucs4 >> 8]
        except KeyError:
            return
        low_byte = ucs4 & 0xff
        words[low_byte >> 5] &= ~(1 << (ucs4 & 0x1f))

    def equals(self, other_cs):
        """Return True when both charsets have the same leaves with equal data."""
        mine = sorted(self.leaves)
        theirs = sorted(other_cs.leaves)
        # Different leaf numbers (or a different count) can never match.
        if mine != theirs:
            return False
        return all(
            leaves_equal(self.leaves[num], other_cs.leaves[num]) for num in mine
        )
a4e54154Smrg
# Turn a file name into a name usable in C declarations
def get_name(file_name):
    """Return everything in ``file_name`` that precedes its first '.'."""
    stem, _sep, _rest = file_name.partition('.')
    return stem
a4e54154Smrg
# Turn a C name into a language name
def get_lang(c_name):
    """Return ``c_name`` lower-cased, with '_' mapped to '-' and spaces dropped."""
    table = str.maketrans({'_': '-', ' ': None})
    return c_name.translate(table).lower()
a4e54154Smrg
def read_orth_file(file_name):
    """Read an orthography file and return its significant lines.

    A line starting with 'include ' is replaced by the lines of the named
    file, read recursively (the name is opened as given, i.e. relative to
    the current working directory).  For every other line, anything from
    the first '#' and anything from the first tab onwards is dropped, and
    the remainder is stripped; lines that end up empty are skipped.

    Returns a list of (file_name, line_number, text) tuples, where
    line_number is the 1-based line within file_name.
    """
    lines = []
    with open(file_name, 'r', encoding='utf-8') as orth_file:
        # Count from 1 so the number matches what an editor shows when it
        # is reported in a diagnostic.
        for num, line in enumerate(orth_file, 1):
            if line.startswith('include '):
                include_fn = line[len('include '):].strip()
                lines.extend(read_orth_file(include_fn))
                continue

            # remove comments and strip whitespace
            line = line.split('#')[0].strip()
            line = line.split('\t')[0].strip()
            # skip empty lines
            if line:
                lines.append((file_name, num, line))

    return lines
a4e54154Smrg
def leaves_equal(leaf1, leaf2):
    """Return True when the two leaves hold equal words, compared pairwise.

    The comparison runs over zip(leaf1, leaf2), so it stops at the end of
    the shorter sequence.
    """
    return all(word1 == word2 for word1, word2 in zip(leaf1, leaf2))
a4e54154Smrg
# Build a single charset from a source file
#
# The file format is quite simple, either
# a single hex value or a pair separated with a dash
def parse_orth_file(file_name, lines):
    """Build a CharSet from the pre-read lines of an orthography file.

    file_name is not used by this function; every entry of lines carries
    the name of the file it came from.

    lines is an iterable of (file_name, line_number, text) tuples as
    produced by read_orth_file.  Each text is a hexadecimal code point or
    a range written START-END or START..END (both ends inclusive).  A
    leading '-' means the code point(s) are removed from the set instead
    of added.

    Returns the resulting CharSet.  int() raises ValueError for text that
    is not valid hexadecimal.
    """
    charset = CharSet()
    for fn, num, line in lines:
        delete_char = line.startswith('-')
        if delete_char:
            line = line[1:]

        # A dash takes precedence over '..' as the range separator.
        if '-' in line:
            parts = line.split('-')
        elif '..' in line:
            parts = line.split('..')
        else:
            parts = [line]

        start = int(parts.pop(0), 16)
        end = start
        if parts:
            end = int(parts.pop(0), 16)
        if parts:
            # Report on stderr: the main script rebinds sys.stdout to the
            # generated file when --output is given, and the diagnostic
            # must not end up inside it.
            print('ERROR: {} line {}: parse error (too many parts)'.format(fn, num),
                  file=sys.stderr)

        for ucs4 in range(start, end + 1):
            if delete_char:
                charset.del_char(ucs4)
            else:
                charset.add_char(ucs4)

    assert charset.equals(charset) # sanity check for the equals function

    return charset
a4e54154Smrg
if __name__=='__main__':
    # Script entry point: read the given .orth files, build one charset per
    # file and print the C tables between the two halves of the template.
    parser = argparse.ArgumentParser()
    parser.add_argument('orth_files', nargs='+', help='List of .orth files')
    parser.add_argument('--directory', dest='directory', default=None)
    parser.add_argument('--template', dest='template_file', default=None)
    parser.add_argument('--output', dest='output_file', default=None)

    args = parser.parse_args()

    sets = []
    names = []
    langs = []
    country = []

    total_leaves = 0

    LangCountrySets = {}

    # Open output file; everything below is emitted with print()
    if args.output_file:
        sys.stdout = open(args.output_file, 'w', encoding='utf-8')

    # Read the template file
    if args.template_file:
        tmpl_file = open(args.template_file, 'r', encoding='utf-8')
    else:
        tmpl_file = sys.stdin

    # Change into source dir if specified (after opening other files)
    if args.directory:
        os.chdir(args.directory)

    # file name -> position on the command line
    orth_entries = {}
    for i, fn in enumerate(args.orth_files):
        orth_entries[fn] = i

    # Charsets are built in sorted file-name order
    for fn in sorted(orth_entries.keys()):
        lines = read_orth_file(fn)
        charset = parse_orth_file(fn, lines)

        sets.append(charset)

        name = get_name(fn)
        names.append(name)

        lang = get_lang(name)
        langs.append(lang)
        if '-' in lang:
            country.append(orth_entries[fn]) # maps to original index
            language_family = lang.split('-')[0]
            if language_family not in LangCountrySets:
                LangCountrySets[language_family] = []
            LangCountrySets[language_family].append(orth_entries[fn])

        total_leaves += len(charset.leaves)

    # Find unique leaves
    leaves = []
    for s in sets:
        for leaf_num in sorted(s.leaves.keys()):
            leaf = s.leaves[leaf_num]
            is_unique = not any(leaves_equal(leaf, existing_leaf)
                                for existing_leaf in leaves)
            if is_unique:
                leaves.append(leaf)

    # Find duplicate charsets: duplicate[i] is the index of the first
    # earlier set equal to sets[i], or None when there is none.
    duplicate = []
    for i, s in enumerate(sets):
        dup_num = None
        for j in range(i):
            if sets[j].equals(s):
                dup_num = j
                break

        duplicate.append(dup_num)

    # Offset of each non-duplicate set's first entry in the leaf_offsets
    # and numbers tables.  Index 0 is a valid duplicate target, so the
    # check must be against None rather than truthiness.
    tn = 0
    off = {}
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue
        off[i] = tn
        tn += len(s.leaves)

    # Scan the input until the marker is found
    # FIXME: this is a bit silly really, might just as well hardcode
    #        the license header in the script and drop the template
    for line in tmpl_file:
        if line.strip() == '@@@':
            break
        print(line, end='')

    print('/* total size: {} unique leaves: {} */\n'.format(total_leaves, len(leaves)))

    print('#define LEAF0       ({} * sizeof (FcLangCharSet))'.format(len(sets)))
    print('#define OFF0        (LEAF0 + {} * sizeof (FcCharLeaf))'.format(len(leaves)))
    print('#define NUM0        (OFF0 + {} * sizeof (uintptr_t))'.format(tn))
    print('#define SET(n)      (n * sizeof (FcLangCharSet) + offsetof (FcLangCharSet, charset))')
    print('#define OFF(s,o)    (OFF0 + o * sizeof (uintptr_t) - SET(s))')
    print('#define NUM(s,n)    (NUM0 + n * sizeof (FcChar16) - SET(s))')
    print('#define LEAF(o,l)   (LEAF0 + l * sizeof (FcCharLeaf) - (OFF0 + o * sizeof (intptr_t)))')
    print('#define fcLangCharSets (fcLangData.langCharSets)')
    print('#define fcLangCharSetIndices (fcLangData.langIndices)')
    print('#define fcLangCharSetIndicesInv (fcLangData.langIndicesInv)')

    assert len(sets) < 65536 # FIXME: need to change index type to 32-bit below then

    print('''
static const struct {{
    FcLangCharSet  langCharSets[{}];
    FcCharLeaf     leaves[{}];
    uintptr_t      leaf_offsets[{}];
    FcChar16       numbers[{}];
    {}       langIndices[{}];
    {}       langIndicesInv[{}];
}} fcLangData = {{'''.format(len(sets), len(leaves), tn, tn,
                             'FcChar16 ', len(sets), 'FcChar16 ', len(sets)))

    # Dump sets; a duplicate set points at the tables of its original
    print('{')
    for i, s in enumerate(sets):
        j = i if duplicate[i] is None else duplicate[i]
        print('    {{ "{}",  {{ FC_REF_CONSTANT, {}, OFF({},{}), NUM({},{}) }} }}, /* {} */'.format(
            langs[i], len(sets[j].leaves), i, off[j], i, off[j], i))

    print('},')

    # Dump leaves
    print('{')
    for leaf_idx, leaf in enumerate(leaves):
        print('    {{ {{ /* {} */'.format(leaf_idx), end='')
        for i in range(0, 8): # 256/32 = 8
            if i % 4 == 0:
                print('\n   ', end='')
            print(' 0x{:08x},'.format(leaf[i]), end='')
        print('\n    } },')
    print('},')

    # Dump leaf offsets (four per output line)
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue

        print('    /* {} */'.format(names[i]))

        for n, leaf_num in enumerate(sorted(s.leaves.keys())):
            leaf = s.leaves[leaf_num]
            if n % 4 == 0:
                print('   ', end='')
            found = [k for k, unique_leaf in enumerate(leaves) if leaves_equal(unique_leaf,leaf)]
            assert found, "Couldn't find leaf in unique leaves list!"
            assert len(found) == 1
            print(' LEAF({:3},{:3}),'.format(off[i], found[0]), end='')
            if n % 4 == 3:
                print('')
        if len(s.leaves) % 4 != 0:
            print('')

    print('},')

    # Dump leaf numbers (eight per output line)
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue

        print('    /* {} */'.format(names[i]))

        for n, leaf_num in enumerate(sorted(s.leaves.keys())):
            if n % 8 == 0:
                print('   ', end='')
            print(' 0x{:04x},'.format(leaf_num), end='')
            if n % 8 == 7:
                print('')
        if len(s.leaves) % 8 != 0:
            print('')

    print('},')

    # langIndices: sorted position -> command-line position
    print('{')
    for i, s in enumerate(sets):
        fn = '{}.orth'.format(names[i])
        print('    {}, /* {} */'.format(orth_entries[fn], names[i]))
    print('},')

    # langIndicesInv: command-line position -> sorted position
    print('{')
    for k in orth_entries:
        name = get_name(k)
        idx = names.index(name)
        print('    {}, /* {} */'.format(idx, name))
    print('}')

    print('};\n')

    print('#define NUM_LANG_CHAR_SET\t{}'.format(len(sets)))
    num_lang_set_map = (len(sets) + 31) // 32
    print('#define NUM_LANG_SET_MAP\t{}'.format(num_lang_set_map))

    # Dump indices with country codes
    assert len(country) > 0
    assert len(LangCountrySets) > 0
    print('')
    print('static const FcChar32 fcLangCountrySets[][NUM_LANG_SET_MAP] = {')
    for k in sorted(LangCountrySets.keys()):
        langset_map = [0] * num_lang_set_map # initialise all zeros
        for entries_id in LangCountrySets[k]:
            langset_map[entries_id >> 5] |= (1 << (entries_id & 0x1f))
        print('    {', end='')
        for v in langset_map:
            print(' 0x{:08x},'.format(v), end='')
        print(' }}, /* {} */'.format(k))

    print('};\n')
    print('#define NUM_COUNTRY_SET {}\n'.format(len(LangCountrySets)))

    # Find ranges for each letter for faster searching
    # Dump sets start/finish for the fastpath
    print('static const FcLangCharSetRange  fcLangCharSetRanges[] = {\n')
    for c in string.ascii_lowercase: # a-z
        # start stays 9999 and stop stays -1 when no name begins with c
        start = 9999
        stop = -1
        for i, s in enumerate(sets):
            if names[i].startswith(c):
                start = min(start,i)
                stop = max(stop,i)
        print('    {{ {}, {} }}, /* {} */'.format(start, stop, c))
    print('};\n')

    # And flush out the rest of the input file
    for line in tmpl_file:
        print(line, end='')

    sys.stdout.flush()