#!/usr/bin/env python3
#
# fontconfig/fc-lang/fc-lang.py
#
# Copyright © 2001-2002 Keith Packard
# Copyright © 2019 Tim-Philipp Müller
#
# Permission to use, copy, modify, distribute, and sell this software and its
# documentation for any purpose is hereby granted without fee, provided that
# the above copyright notice appear in all copies and that both that
# copyright notice and this permission notice appear in supporting
# documentation, and that the name of the author(s) not be used in
# advertising or publicity pertaining to distribution of the software without
# specific, written prior permission.  The authors make no
# representations about the suitability of this software for any purpose.  It
# is provided "as is" without express or implied warranty.
#
# THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
# INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
# EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR
# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.

# fc-lang
#
# Read a set of language orthographies and build C declarations for
# charsets which can then be used to identify which languages are
# supported by a given font.
#
# TODO: this code is not very pythonic, a lot of it is a 1:1 translation
# of the C code and we could probably simplify it a bit
import argparse
import contextlib
import string
import sys
import os


# we just store the leaves in a dict, we can order the leaves later if needed
class CharSet:
    """Sparse set of code points stored as 256-code-point bitmap leaves."""

    def __init__(self):
        self.leaves = {}  # leaf_number -> leaf data (= 8 uint32, 256/32)

    def add_char(self, ucs4):
        """Set the bit for code point `ucs4`, creating its leaf if needed."""
        assert ucs4 < 0x01000000
        leaf = self.leaves.setdefault(ucs4 >> 8, [0] * 8)  # 256/32 = 8
        leaf[(ucs4 & 0xff) >> 5] |= (1 << (ucs4 & 0x1f))

    def del_char(self, ucs4):
        """Clear the bit for code point `ucs4` if its leaf exists."""
        assert ucs4 < 0x01000000
        leaf = self.leaves.get(ucs4 >> 8)
        if leaf is not None:
            leaf[(ucs4 & 0xff) >> 5] &= ~(1 << (ucs4 & 0x1f))
            # We don't bother removing the leaf if it's empty

    def equals(self, other_cs):
        """Return True if both charsets have the same leaves with equal data."""
        if sorted(self.leaves) != sorted(other_cs.leaves):
            return False
        return all(leaves_equal(leaf, other_cs.leaves[num])
                   for num, leaf in self.leaves.items())


# Convert a file name into a name suitable for C declarations
def get_name(file_name):
    """Return `file_name` truncated at its first '.'."""
    return file_name.split('.')[0]


# Convert a C name into a language name
def get_lang(c_name):
    """Return `c_name` lower-cased, with '_' turned into '-' and spaces removed."""
    return c_name.replace('_', '-').replace(' ', '').lower()


def read_orth_file(file_name):
    """Read an orthography file, following `include` lines recursively.

    Returns a list of (file_name, line_number, text) tuples, one per
    non-empty line once comments ('#' to end of line) and anything after a
    tab have been removed.  Line numbers are 1-based so that diagnostics
    match what an editor shows.  Included files are opened relative to the
    current working directory.
    """
    lines = []
    with open(file_name, 'r', encoding='utf-8') as orth_file:
        for num, line in enumerate(orth_file, start=1):
            if line.startswith('include '):
                # a trailing comment must not end up in the file name
                include_fn = line[8:].split('#')[0].split('\t')[0].strip()
                lines += read_orth_file(include_fn)
            else:
                # remove comments and strip whitespaces
                line = line.split('#')[0].strip()
                line = line.split('\t')[0].strip()
                # skip empty lines
                if line:
                    lines.append((file_name, num, line))

    return lines


def leaves_equal(leaf1, leaf2):
    """Return True if the two leaves hold the same words, compared pairwise."""
    return all(v1 == v2 for v1, v2 in zip(leaf1, leaf2))


# Build a single charset from a source file
#
# The file format is quite simple, either
# a single hex value or a pair separated with a dash
def parse_orth_file(file_name, lines):
    """Build a CharSet from the tuples produced by read_orth_file().

    Each line is a hex code point or a range ('A-B' or 'A..B'); a leading
    '-' removes the code point(s) instead of adding them.  A line with more
    than two parts is reported on stderr and only its first two parts are
    used.
    """
    charset = CharSet()
    for fn, num, line in lines:
        delete_char = line.startswith('-')
        if delete_char:
            line = line[1:]
        if '-' in line:
            parts = line.split('-')
        elif '..' in line:
            parts = line.split('..')
        else:
            parts = [line]

        start = int(parts.pop(0), 16)
        end = start
        if parts:
            end = int(parts.pop(0), 16)
        if parts:
            # stdout may be the generated C file, so report on stderr
            print('ERROR: {} line {}: parse error (too many parts)'.format(fn, num),
                  file=sys.stderr)

        for ucs4 in range(start, end + 1):
            if delete_char:
                charset.del_char(ucs4)
            else:
                charset.add_char(ucs4)

    assert charset.equals(charset)  # sanity check for the equals function

    return charset


def find_unique_leaves(sets):
    """Collect the distinct leaves of all charsets in first-seen order.

    Returns (leaves, leaf_index) where leaf_index maps tuple(leaf) to the
    position of that leaf in leaves.
    """
    leaves = []
    leaf_index = {}
    for s in sets:
        for leaf_num in sorted(s.leaves):
            leaf = s.leaves[leaf_num]
            key = tuple(leaf)
            if key not in leaf_index:
                leaf_index[key] = len(leaves)
                leaves.append(leaf)
    return leaves, leaf_index


def find_duplicates(sets):
    """For each charset return the index of the first earlier equal one, or None."""
    duplicate = []
    for i, s in enumerate(sets):
        dup_num = None
        for j in range(i):
            if sets[j].equals(s):
                dup_num = j
                break
        duplicate.append(dup_num)
    return duplicate


def generate(orth_files, tmpl_file):
    """Write the generated C tables to sys.stdout.

    orth_files is the list of orthography file names in command-line order
    (that order defines the language indices); tmpl_file is an iterable of
    template lines.  Template lines before the '@@@' marker are copied
    first, then the generated tables, then the remaining template lines.
    """
    sets = []
    names = []
    langs = []
    country = []
    total_leaves = 0
    lang_country_sets = {}

    orth_entries = {fn: i for i, fn in enumerate(orth_files)}
    sorted_files = sorted(orth_entries)

    for fn in sorted_files:
        charset = parse_orth_file(fn, read_orth_file(fn))
        sets.append(charset)

        name = get_name(fn)
        names.append(name)

        lang = get_lang(name)
        langs.append(lang)
        if '-' in lang:
            country.append(orth_entries[fn])  # maps to original index
            language_family = lang.split('-')[0]
            lang_country_sets.setdefault(language_family, []).append(orth_entries[fn])

        total_leaves += len(charset.leaves)

    # Find unique leaves
    leaves, leaf_index = find_unique_leaves(sets)

    # Find duplicate charsets
    duplicate = find_duplicates(sets)

    # Assign each non-duplicate set its offset into the leaf tables.
    # Compare against None: a duplicate of set 0 has index 0, which is falsy.
    tn = 0
    off = {}
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue
        off[i] = tn
        tn += len(s.leaves)

    # Scan the input until the marker is found
    # FIXME: this is a bit silly really, might just as well hardcode
    #        the license header in the script and drop the template
    for line in tmpl_file:
        if line.strip() == '@@@':
            break
        print(line, end='')

    print('/* total size: {} unique leaves: {} */\n'.format(total_leaves, len(leaves)))

    print('#define LEAF0       ({} * sizeof (FcLangCharSet))'.format(len(sets)))
    print('#define OFF0        (LEAF0 + {} * sizeof (FcCharLeaf))'.format(len(leaves)))
    print('#define NUM0        (OFF0 + {} * sizeof (uintptr_t))'.format(tn))
    print('#define SET(n)      (n * sizeof (FcLangCharSet) + offsetof (FcLangCharSet, charset))')
    print('#define OFF(s,o)    (OFF0 + o * sizeof (uintptr_t) - SET(s))')
    print('#define NUM(s,n)    (NUM0 + n * sizeof (FcChar16) - SET(s))')
    print('#define LEAF(o,l)   (LEAF0 + l * sizeof (FcCharLeaf) - (OFF0 + o * sizeof (intptr_t)))')
    print('#define fcLangCharSets (fcLangData.langCharSets)')
    print('#define fcLangCharSetIndices (fcLangData.langIndices)')
    print('#define fcLangCharSetIndicesInv (fcLangData.langIndicesInv)')

    assert len(sets) < 65536  # FIXME: need to change index type to 32-bit below then

    print('''
static const struct {{
    FcLangCharSet  langCharSets[{}];
    FcCharLeaf     leaves[{}];
    uintptr_t      leaf_offsets[{}];
    FcChar16       numbers[{}];
    {}      langIndices[{}];
    {}      langIndicesInv[{}];
}} fcLangData = {{'''.format(len(sets), len(leaves), tn, tn,
                              'FcChar16 ', len(sets), 'FcChar16 ', len(sets)))

    # Dump sets
    print('{')
    for i, s in enumerate(sets):
        j = i if duplicate[i] is None else duplicate[i]
        print('    {{ "{}",  {{ FC_REF_CONSTANT, {}, OFF({},{}), NUM({},{}) }} }}, /* {} */'.format(
            langs[i], len(sets[j].leaves), i, off[j], i, off[j], i))

    print('},')

    # Dump leaves
    print('{')
    for l, leaf in enumerate(leaves):
        print('    {{ {{ /* {} */'.format(l), end='')
        for i in range(0, 8):  # 256/32 = 8
            if i % 4 == 0:
                print('\n   ', end='')
            print(' 0x{:08x},'.format(leaf[i]), end='')
        print('\n    } },')
    print('},')

    # Dump leaf offsets
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue

        print('    /* {} */'.format(names[i]))

        for n, leaf_num in enumerate(sorted(s.leaves)):
            if n % 4 == 0:
                print('   ', end='')
            found = leaf_index[tuple(s.leaves[leaf_num])]
            print(' LEAF({:3},{:3}),'.format(off[i], found), end='')
            if n % 4 == 3:
                print('')
        if len(s.leaves) % 4 != 0:
            print('')

    print('},')

    # Dump leaf numbers
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue

        print('    /* {} */'.format(names[i]))

        for n, leaf_num in enumerate(sorted(s.leaves)):
            if n % 8 == 0:
                print('   ', end='')
            print(' 0x{:04x},'.format(leaf_num), end='')
            if n % 8 == 7:
                print('')
        if len(s.leaves) % 8 != 0:
            print('')

    print('},')

    # langIndices
    print('{')
    for fn, name in zip(sorted_files, names):
        print('    {}, /* {} */'.format(orth_entries[fn], name))
    print('},')

    # langIndicesInv
    print('{')
    for k in orth_entries:
        name = get_name(k)
        idx = names.index(name)
        print('    {}, /* {} */'.format(idx, name))
    print('}')

    print('};\n')

    print('#define NUM_LANG_CHAR_SET\t{}'.format(len(sets)))
    num_lang_set_map = (len(sets) + 31) // 32
    print('#define NUM_LANG_SET_MAP\t{}'.format(num_lang_set_map))

    # Dump indices with country codes
    assert len(country) > 0
    assert len(lang_country_sets) > 0
    print('')
    print('static const FcChar32 fcLangCountrySets[][NUM_LANG_SET_MAP] = {')
    for k in sorted(lang_country_sets):
        langset_map = [0] * num_lang_set_map  # initialise all zeros
        for entries_id in lang_country_sets[k]:
            langset_map[entries_id >> 5] |= (1 << (entries_id & 0x1f))
        print('    {', end='')
        for v in langset_map:
            print(' 0x{:08x},'.format(v), end='')
        print(' }}, /* {} */'.format(k))

    print('};\n')
    print('#define NUM_COUNTRY_SET {}\n'.format(len(lang_country_sets)))

    # Find ranges for each letter for faster searching
    # Dump sets start/finish for the fastpath
    print('static const FcLangCharSetRange  fcLangCharSetRanges[] = {\n')
    for c in string.ascii_lowercase:  # a-z
        start = 9999
        stop = -1
        for i, name in enumerate(names):
            if name.startswith(c):
                start = min(start, i)
                stop = max(stop, i)
        print('    {{ {}, {} }}, /* {} */'.format(start, stop, c))
    print('};\n')

    # And flush out the rest of the input file
    for line in tmpl_file:
        print(line, end='')


def main():
    """Parse the command line and write the generated C source."""
    parser = argparse.ArgumentParser()
    parser.add_argument('orth_files', nargs='+', help='List of .orth files')
    parser.add_argument('--directory', dest='directory', default=None)
    parser.add_argument('--template', dest='template_file', default=None)
    parser.add_argument('--output', dest='output_file', default=None)

    args = parser.parse_args()

    with contextlib.ExitStack() as stack:
        # Open output file
        if args.output_file:
            out_file = stack.enter_context(
                open(args.output_file, 'w', encoding='utf-8'))
            stack.enter_context(contextlib.redirect_stdout(out_file))

        # Read the template file
        if args.template_file:
            tmpl_file = stack.enter_context(
                open(args.template_file, 'r', encoding='utf-8'))
        else:
            tmpl_file = sys.stdin

        # Change into source dir if specified (after opening other files)
        if args.directory:
            os.chdir(args.directory)

        generate(args.orth_files, tmpl_file)

        sys.stdout.flush()


if __name__ == '__main__':
    main()