1#!/usr/bin/env python3 2# 3# fontconfig/fc-lang/fc-lang.py 4# 5# Copyright © 2001-2002 Keith Packard 6# Copyright © 2019 Tim-Philipp Müller 7# 8# Permission to use, copy, modify, distribute, and sell this software and its 9# documentation for any purpose is hereby granted without fee, provided that 10# the above copyright notice appear in all copies and that both that 11# copyright notice and this permission notice appear in supporting 12# documentation, and that the name of the author(s) not be used in 13# advertising or publicity pertaining to distribution of the software without 14# specific, written prior permission. The authors make no 15# representations about the suitability of this software for any purpose. It 16# is provided "as is" without express or implied warranty. 17# 18# THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 19# INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO 20# EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR 21# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 22# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 23# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 24# PERFORMANCE OF THIS SOFTWARE. 25 26# fc-lang 27# 28# Read a set of language orthographies and build C declarations for 29# charsets which can then be used to identify which languages are 30# supported by a given font. 
#
# TODO: this code is not very pythonic, a lot of it is a 1:1 translation
# of the C code and we could probably simplify it a bit
import argparse
import string
import sys
import os


# we just store the leaves in a dict, we can order the leaves later if needed
class CharSet:
    """Sparse set of Unicode codepoints stored as 256-codepoint 'leaves'.

    Each leaf covers the codepoints sharing the same upper bits
    (ucs4 >> 8) and is a list of 8 uint32 bitmask words (8 * 32 = 256 bits).
    """

    def __init__(self):
        self.leaves = {}  # leaf_number (ucs4 >> 8) -> leaf data (= 8 uint32)

    def add_char(self, ucs4):
        """Set the bit for codepoint ucs4, creating its leaf if needed."""
        assert ucs4 < 0x01000000
        leaf_num = ucs4 >> 8
        if leaf_num in self.leaves:
            leaf = self.leaves[leaf_num]
        else:
            leaf = [0, 0, 0, 0, 0, 0, 0, 0]  # 256/32 = 8
            self.leaves[leaf_num] = leaf
        leaf[(ucs4 & 0xff) >> 5] |= (1 << (ucs4 & 0x1f))
        #print('{:08x} [{:04x}] --> {}'.format(ucs4, ucs4>>8, leaf))

    def del_char(self, ucs4):
        """Clear the bit for codepoint ucs4 (no-op if its leaf is absent)."""
        assert ucs4 < 0x01000000
        leaf_num = ucs4 >> 8
        if leaf_num in self.leaves:
            leaf = self.leaves[leaf_num]
            leaf[(ucs4 & 0xff) >> 5] &= ~(1 << (ucs4 & 0x1f))
            # We don't bother removing the leaf if it's empty
            #print('{:08x} [{:04x}] --> {}'.format(ucs4, ucs4>>8, leaf))

    def equals(self, other_cs):
        """Return True if both charsets contain exactly the same codepoints."""
        keys = sorted(self.leaves.keys())
        other_keys = sorted(other_cs.leaves.keys())
        if len(keys) != len(other_keys):
            return False
        for k1, k2 in zip(keys, other_keys):
            if k1 != k2:
                return False
            if not leaves_equal(self.leaves[k1], other_cs.leaves[k2]):
                return False
        return True


# Convert a file name into a name suitable for C declarations
def get_name(file_name):
    return file_name.split('.')[0]


# Convert a C name into a language name
def get_lang(c_name):
    return c_name.replace('_', '-').replace(' ', '').lower()


def read_orth_file(file_name):
    """Read an .orth file, following 'include' directives recursively.

    Returns a list of (file_name, line_number, stripped_line) tuples with
    comments and empty lines removed. Line numbers are 1-based so parse
    errors point at the right line.
    """
    lines = []
    with open(file_name, 'r', encoding='utf-8') as orth_file:
        for num, line in enumerate(orth_file, 1):
            if line.startswith('include '):
                include_fn = line[8:].strip()
                lines += read_orth_file(include_fn)
            else:
                # remove comments and strip whitespaces
                line = line.split('#')[0].strip()
                line = line.split('\t')[0].strip()
                # skip empty lines
                if line:
                    lines += [(file_name, num, line)]

    return lines


def leaves_equal(leaf1, leaf2):
    """Return True if two leaves (8-word bitmask lists) are identical."""
    for v1, v2 in zip(leaf1, leaf2):
        if v1 != v2:
            return False
    return True


# Build a single charset from a source file
#
# The file format is quite simple, either
# a single hex value or a pair separated with a dash
def parse_orth_file(file_name, lines):
    """Build a CharSet from pre-read orth lines (see read_orth_file).

    Each line is a hex codepoint or a range ('XXXX-YYYY' or 'XXXX..YYYY'),
    optionally prefixed with '-' to remove the codepoint(s) instead.
    """
    charset = CharSet()
    for fn, num, line in lines:
        delete_char = line.startswith('-')
        if delete_char:
            line = line[1:]
        if line.find('-') != -1:
            parts = line.split('-')
        elif line.find('..') != -1:
            parts = line.split('..')
        else:
            parts = [line]

        start = int(parts.pop(0), 16)
        end = start
        if parts:
            end = int(parts.pop(0), 16)
        if parts:
            print('ERROR: {} line {}: parse error (too many parts)'.format(fn, num))

        for ucs4 in range(start, end + 1):
            if delete_char:
                charset.del_char(ucs4)
            else:
                charset.add_char(ucs4)

    assert charset.equals(charset)  # sanity check for the equals function

    return charset


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('orth_files', nargs='+', help='List of .orth files')
    parser.add_argument('--directory', dest='directory', default=None)
    parser.add_argument('--template', dest='template_file', default=None)
    parser.add_argument('--output', dest='output_file', default=None)

    args = parser.parse_args()

    sets = []
    names = []
    langs = []
    country = []

    total_leaves = 0

    LangCountrySets = {}

    # Open output file
    if args.output_file:
        sys.stdout = open(args.output_file, 'w', encoding='utf-8')

    # Read the template file
    if args.template_file:
        tmpl_file = open(args.template_file, 'r', encoding='utf-8')
    else:
        tmpl_file = sys.stdin

    # Change into source dir if specified (after opening other files)
    if args.directory:
        os.chdir(args.directory)

    # Remember each file's position on the command line: that original
    # index is what the generated langIndices tables refer to.
    orth_entries = {}
    for i, fn in enumerate(args.orth_files):
        orth_entries[fn] = i

    # Parse the orth files in sorted order so the generated tables are stable.
    for fn in sorted(orth_entries.keys()):
        lines = read_orth_file(fn)
        charset = parse_orth_file(fn, lines)

        sets.append(charset)

        name = get_name(fn)
        names.append(name)

        lang = get_lang(name)
        langs.append(lang)
        if lang.find('-') != -1:
            country.append(orth_entries[fn])  # maps to original index
            language_family = lang.split('-')[0]
            if language_family not in LangCountrySets:
                LangCountrySets[language_family] = []
            LangCountrySets[language_family] += [orth_entries[fn]]

        total_leaves += len(charset.leaves)

    # Find unique leaves
    leaves = []
    for s in sets:
        for leaf_num in sorted(s.leaves.keys()):
            leaf = s.leaves[leaf_num]
            is_unique = True
            for existing_leaf in leaves:
                if leaves_equal(leaf, existing_leaf):
                    is_unique = False
                    break
            #print('unique: ', is_unique)
            if is_unique:
                leaves.append(leaf)

    # Find duplicate charsets: duplicate[i] is the index of the first
    # earlier identical charset, or None if set i is unique.
    duplicate = []
    for i, s in enumerate(sets):
        dup_num = None
        if i >= 1:
            for j, s_cmp in enumerate(sets):
                if j >= i:
                    break
                if s_cmp.equals(s):
                    dup_num = j
                    break

        duplicate.append(dup_num)

    # Assign each unique charset its offset into the shared leaf tables.
    # NOTE: must test 'is not None' — index 0 is a valid duplicate target
    # and would be treated as "not a duplicate" by a bare truth test.
    tn = 0
    off = {}
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue
        off[i] = tn
        tn += len(s.leaves)

    # Scan the input until the marker is found
    # FIXME: this is a bit silly really, might just as well hardcode
    # the license header in the script and drop the template
    for line in tmpl_file:
        if line.strip() == '@@@':
            break
        print(line, end='')

    print('/* total size: {} unique leaves: {} */\n'.format(total_leaves, len(leaves)))

    print('#define LEAF0 ({} * sizeof (FcLangCharSet))'.format(len(sets)))
    print('#define OFF0 (LEAF0 + {} * sizeof (FcCharLeaf))'.format(len(leaves)))
    print('#define NUM0 (OFF0 + {} * sizeof (uintptr_t))'.format(tn))
    print('#define SET(n) (n * sizeof (FcLangCharSet) + offsetof (FcLangCharSet, charset))')
    print('#define OFF(s,o) (OFF0 + o * sizeof (uintptr_t) - SET(s))')
    print('#define NUM(s,n) (NUM0 + n * sizeof (FcChar16) - SET(s))')
    print('#define LEAF(o,l) (LEAF0 + l * sizeof (FcCharLeaf) - (OFF0 + o * sizeof (intptr_t)))')
    print('#define fcLangCharSets (fcLangData.langCharSets)')
    print('#define fcLangCharSetIndices (fcLangData.langIndices)')
    print('#define fcLangCharSetIndicesInv (fcLangData.langIndicesInv)')

    assert len(sets) < 65536  # FIXME: need to change index type to 32-bit below then

    print('''
static const struct {{
 FcLangCharSet langCharSets[{}];
 FcCharLeaf leaves[{}];
 uintptr_t leaf_offsets[{}];
 FcChar16 numbers[{}];
 {} langIndices[{}];
 {} langIndicesInv[{}];
}} fcLangData = {{'''.format(len(sets), len(leaves), tn, tn,
                             'FcChar16 ', len(sets), 'FcChar16 ', len(sets)))

    # Dump sets: duplicates share the leaf data of their first occurrence.
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            j = duplicate[i]
        else:
            j = i
        print(' {{ "{}", {{ FC_REF_CONSTANT, {}, OFF({},{}), NUM({},{}) }} }}, /* {} */'.format(
            langs[i], len(sets[j].leaves), i, off[j], i, off[j], i))

    print('},')

    # Dump the unique leaves
    print('{')
    for l, leaf in enumerate(leaves):
        print(' {{ {{ /* {} */'.format(l), end='')
        for i in range(0, 8):  # 256/32 = 8
            if i % 4 == 0:
                print('\n ', end='')
            print(' 0x{:08x},'.format(leaf[i]), end='')
        print('\n } },')
    print('},')

    # Dump leaf offsets (per unique charset, pointing into the leaves array)
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue

        print(' /* {} */'.format(names[i]))

        for n, leaf_num in enumerate(sorted(s.leaves.keys())):
            leaf = s.leaves[leaf_num]
            if n % 4 == 0:
                print(' ', end='')
            found = [k for k, unique_leaf in enumerate(leaves) if leaves_equal(unique_leaf, leaf)]
            assert found, "Couldn't find leaf in unique leaves list!"
            assert len(found) == 1
            print(' LEAF({:3},{:3}),'.format(off[i], found[0]), end='')
            if n % 4 == 3:
                print('')
        if len(s.leaves) % 4 != 0:
            print('')

    print('},')

    # Dump leaf numbers (the high bits of each charset's codepoint ranges)
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue

        print(' /* {} */'.format(names[i]))

        for n, leaf_num in enumerate(sorted(s.leaves.keys())):
            if n % 8 == 0:
                print(' ', end='')
            print(' 0x{:04x},'.format(leaf_num), end='')
            if n % 8 == 7:
                print('')
        if len(s.leaves) % 8 != 0:
            print('')

    print('},')

    # langIndices: sorted position -> original command-line index
    print('{')
    for name in names:
        fn = '{}.orth'.format(name)
        print(' {}, /* {} */'.format(orth_entries[fn], name))
    print('},')

    # langIndicesInv: original command-line index -> sorted position
    print('{')
    for k in orth_entries.keys():
        name = get_name(k)
        idx = names.index(name)
        print(' {}, /* {} */'.format(idx, name))
    print('}')

    print('};\n')

    print('#define NUM_LANG_CHAR_SET {}'.format(len(sets)))
    num_lang_set_map = (len(sets) + 31) // 32
    print('#define NUM_LANG_SET_MAP {}'.format(num_lang_set_map))

    # Dump indices with country codes
    assert len(country) > 0
    assert len(LangCountrySets) > 0
    print('')
    print('static const FcChar32 fcLangCountrySets[][NUM_LANG_SET_MAP] = {')
    for k in sorted(LangCountrySets.keys()):
        langset_map = [0] * num_lang_set_map  # initialise all zeros
        for entries_id in LangCountrySets[k]:
            langset_map[entries_id >> 5] |= (1 << (entries_id & 0x1f))
        print(' {', end='')
        for v in langset_map:
            print(' 0x{:08x},'.format(v), end='')
        print(' }}, /* {} */'.format(k))

    print('};\n')
    print('#define NUM_COUNTRY_SET {}\n'.format(len(LangCountrySets)))

    # Find ranges for each letter for faster searching
    # Dump sets start/finish for the fastpath
    print('static const FcLangCharSetRange fcLangCharSetRanges[] = {\n')
    for c in string.ascii_lowercase:  # a-z
        start = 9999
        stop = -1
        for i, s in enumerate(sets):
            if names[i].startswith(c):
                start = min(start, i)
                stop = max(stop, i)
        print(' {{ {}, {} }}, /* {} */'.format(start, stop, c))
    print('};\n')

    # And flush out the rest of the input file
    for line in tmpl_file:
        print(line, end='')

    sys.stdout.flush()