fc-case.py revision a4e54154
#!/usr/bin/env python3
#
# fontconfig/fc-case/fc-case.py
#
# Copyright © 2004 Keith Packard
# Copyright © 2019 Tim-Philipp Müller
#
# Permission to use, copy, modify, distribute, and sell this software and its
# documentation for any purpose is hereby granted without fee, provided that
# the above copyright notice appear in all copies and that both that
# copyright notice and this permission notice appear in supporting
# documentation, and that the name of the author(s) not be used in
# advertising or publicity pertaining to distribution of the software without
# specific, written prior permission. The authors make no
# representations about the suitability of this software for any purpose. It
# is provided "as is" without express or implied warranty.
#
# THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
# INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
# EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR
# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
from enum import Enum
import argparse
import contextlib
import string
import sys


class CaseFoldClass(Enum):
    """Status letter found in the second field of a CaseFolding.txt entry."""
    COMMON = 1
    FULL = 2
    SIMPLE = 3
    TURKIC = 4


class CaseFoldMethod(Enum):
    """Kind of entry emitted into the generated fcCaseFold table."""
    RANGE = 0
    EVEN_ODD = 1
    FULL = 2


# Maps the status letter of a CaseFolding.txt entry to its CaseFoldClass.
caseFoldClassMap = {
    'C': CaseFoldClass.COMMON,
    'F': CaseFoldClass.FULL,
    'S': CaseFoldClass.SIMPLE,
    'T': CaseFoldClass.TURKIC,
}

# Fold table entries of the most recent run of main(); each entry is a dict
# with the keys 'upper', 'offset', 'method' and 'count'.
folds = []


def ucs4_to_utf8(ucs4):
    """Return the UTF-8 encoding of code point *ucs4* as a list of byte values.

    Sequences of up to six bytes are produced (values below 0x80000000);
    anything larger yields an empty list.
    """
    utf8_rep = []

    # Emit the lead byte and remember the shift of the first continuation
    # byte; a negative shift means there are no continuation bytes.
    if ucs4 < 0x80:
        utf8_rep.append(ucs4)
        bits = -6
    elif ucs4 < 0x800:
        utf8_rep.append(((ucs4 >> 6) & 0x1F) | 0xC0)
        bits = 0
    elif ucs4 < 0x10000:
        utf8_rep.append(((ucs4 >> 12) & 0x0F) | 0xE0)
        bits = 6
    elif ucs4 < 0x200000:
        utf8_rep.append(((ucs4 >> 18) & 0x07) | 0xF0)
        bits = 12
    elif ucs4 < 0x4000000:
        utf8_rep.append(((ucs4 >> 24) & 0x03) | 0xF8)
        bits = 18
    elif ucs4 < 0x80000000:
        utf8_rep.append(((ucs4 >> 30) & 0x01) | 0xFC)
        bits = 24
    else:
        return []

    # Continuation bytes carry six payload bits each, most significant first.
    while bits >= 0:
        utf8_rep.append(((ucs4 >> bits) & 0x3F) | 0x80)
        bits -= 6

    return utf8_rep


def utf8_size(ucs4):
    """Return the number of bytes ucs4_to_utf8() produces for *ucs4*."""
    return len(ucs4_to_utf8(ucs4))


# C enumerator text (including the trailing comma) written for each method.
case_fold_method_name_map = {
    CaseFoldMethod.RANGE: 'FC_CASE_FOLD_RANGE,',
    CaseFoldMethod.EVEN_ODD: 'FC_CASE_FOLD_EVEN_ODD,',
    CaseFoldMethod.FULL: 'FC_CASE_FOLD_FULL,',
}


def fold_offset_as_short(offset):
    """Wrap *offset* by 65536 so that it fits a signed 16-bit integer.

    Offsets that already lie within -32768..32767 are returned unchanged.
    """
    if offset < -32768:
        offset += 65536
    if offset > 32767:
        offset -= 65536
    return offset


def parse_case_folding(lines):
    """Build the case folding tables from the lines of CaseFolding.txt.

    *lines* is any iterable of text lines.  Lines that do not start with a
    hexadecimal digit (comments, blank lines) are ignored.  Only entries of
    class COMMON and FULL produce table entries, but every entry contributes
    to the minimum / maximum folded character.

    Returns a dict with the keys:
      'folds'          list of fold dicts ('upper', 'offset', 'method', 'count')
      'fold_chars'     UTF-8 byte values referenced by the FULL folds
      'max_fold_chars' largest 'count' of any FULL fold
      'max_expand'     largest growth in UTF-8 bytes caused by folding
      'min_fold_char'  first code point listed, or None without entries
      'max_fold_char'  last code point listed, or None without entries

    Exits the process with status 1 when an entry has fewer than three
    '; '-separated fields.
    """
    entries = []
    fold_chars = []
    max_fold_chars = 0
    max_expand = 0
    min_fold_char = None
    max_fold_char = None
    fold = None

    for lineno, line in enumerate(lines, 1):
        if not line or line[0] not in string.hexdigits:
            continue

        tokens = line.split('; ')
        if len(tokens) < 3:
            print('Not enough tokens in line {}'.format(lineno), file=sys.stderr)
            sys.exit(1)

        upper = int(tokens[0], 16)
        cfclass = caseFoldClassMap[tokens[1]]
        lower = [int(s, 16) for s in tokens[2].split()]

        # Compare against None: code point 0 is falsy but still a valid value.
        if min_fold_char is None:
            min_fold_char = upper
        max_fold_char = upper

        if cfclass not in (CaseFoldClass.COMMON, CaseFoldClass.FULL):
            continue

        if len(lower) == 1:
            offset = lower[0] - upper

            # Does this mapping continue the run described by the last fold?
            if fold and fold['method'] == CaseFoldMethod.RANGE:
                extends = (offset == fold['offset']
                           and upper == fold['upper'] + fold['count'])
            elif fold and fold['method'] == CaseFoldMethod.EVEN_ODD:
                # Even/odd runs list every second code point.
                extends = (offset == 1
                           and upper == fold['upper'] + fold['count'] + 1)
            else:
                extends = False

            if extends:
                # 'fold' is the last element of 'entries', so this updates
                # the table in place.
                fold['count'] = upper - fold['upper'] + 1
            else:
                if offset == 1:
                    method = CaseFoldMethod.EVEN_ODD
                else:
                    method = CaseFoldMethod.RANGE
                fold = {
                    'upper': upper,
                    'offset': offset,
                    'method': method,
                    'count': 1,
                }
                entries.append(fold)

            expand = utf8_size(lower[0]) - utf8_size(upper)
        else:
            # Multi-character result: store its UTF-8 bytes in fold_chars and
            # record where they start and how many there are.
            start = len(fold_chars)
            for c in lower:
                fold_chars.extend(ucs4_to_utf8(c))

            fold = {
                'upper': upper,
                'offset': start,
                'method': CaseFoldMethod.FULL,
                'count': len(fold_chars) - start,
            }
            entries.append(fold)

            if fold['count'] > max_fold_chars:
                max_fold_chars = fold['count']

            expand = fold['count'] - utf8_size(upper)

        # Track the growth for both kinds of mapping.
        if expand > max_expand:
            max_expand = expand

    return {
        'folds': entries,
        'fold_chars': fold_chars,
        'max_fold_chars': max_fold_chars,
        'max_expand': max_expand,
        'min_fold_char': min_fold_char,
        'max_fold_char': max_fold_char,
    }


def write_tables(out, tables):
    """Write the C defines and tables for *tables* to the text stream *out*.

    *tables* is the dict returned by parse_case_folding(); its
    'min_fold_char' and 'max_fold_char' must not be None.
    """
    fold_entries = tables['folds']
    fold_chars = tables['fold_chars']

    print('#define FC_NUM_CASE_FOLD\t{}'.format(len(fold_entries)), file=out)
    print('#define FC_NUM_CASE_FOLD_CHARS\t{}'.format(len(fold_chars)), file=out)
    print('#define FC_MAX_CASE_FOLD_CHARS\t{}'.format(tables['max_fold_chars']), file=out)
    print('#define FC_MAX_CASE_FOLD_EXPAND\t{}'.format(tables['max_expand']), file=out)
    print('#define FC_MIN_FOLD_CHAR\t0x{:08x}'.format(tables['min_fold_char']), file=out)
    print('#define FC_MAX_FOLD_CHAR\t0x{:08x}'.format(tables['max_fold_char']), file=out)
    print('', file=out)

    # Dump out ranges
    print('static const FcCaseFold fcCaseFold[FC_NUM_CASE_FOLD] = {', file=out)
    for f in fold_entries:
        print(' {} 0x{:08x}, {:22s} 0x{:04x}, {:6d} {},'.format(
            '{', f['upper'], case_fold_method_name_map[f['method']],
            f['count'], fold_offset_as_short(f['offset']), '}'), file=out)
    print('};\n', file=out)

    # Dump out "other" values, sixteen bytes per line
    print('static const FcChar8\tfcCaseFoldChars[FC_NUM_CASE_FOLD_CHARS] = {', file=out)
    last = len(fold_chars) - 1
    for n, c in enumerate(fold_chars):
        if n == last:
            end = ''
        elif n % 16 == 15:
            end = ',\n'
        else:
            end = ','
        print('0x{:02x}'.format(c), end=end, file=out)
    print('\n};', file=out)


def main():
    """Generate the case folding header from CaseFolding.txt and a template.

    The template (``--template``, default stdin) is copied to the output
    (``--output``, default stdout) with the generated tables inserted in
    place of the line consisting of '@@@'.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('case_folding_file')
    parser.add_argument('--template', dest='template_file', default=None)
    parser.add_argument('--output', dest='output_file', default=None)

    args = parser.parse_args()

    # Read the standard Unicode CaseFolding.txt file
    with open(args.case_folding_file, 'r', encoding='utf-8') as casefile:
        tables = parse_case_folding(casefile)

    # Fail before touching the output file when there is nothing to emit.
    if tables['min_fold_char'] is None:
        print('No case folding entries found in {}'.format(args.case_folding_file),
              file=sys.stderr)
        sys.exit(1)

    folds[:] = tables['folds']

    # The ExitStack closes whichever of the two files this script opened.
    with contextlib.ExitStack() as stack:
        # Open output file
        if args.output_file:
            out = stack.enter_context(
                open(args.output_file, 'w', encoding='utf-8'))
        else:
            out = sys.stdout

        # Read the template file
        if args.template_file:
            tmpl_file = stack.enter_context(
                open(args.template_file, 'r', encoding='utf-8'))
        else:
            tmpl_file = sys.stdin

        # Scan the input until the marker is found
        # FIXME: this is a bit silly really, might just as well hardcode
        # the license header in the script and drop the template
        for line in tmpl_file:
            if line.strip() == '@@@':
                break
            print(line, end='', file=out)

        # Dump these tables
        write_tables(out, tables)

        # And flush out the rest of the input file
        for line in tmpl_file:
            print(line, end='', file=out)

        out.flush()


if __name__ == '__main__':
    main()