charclass.c revision f2e35a3a
1/* $XTermId: charclass.c,v 1.44 2021/02/02 00:19:32 tom Exp $ */ 2 3/* 4 * Copyright 2002-2020,2021 by Thomas E. Dickey 5 * 6 * All Rights Reserved 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a 9 * copy of this software and associated documentation files (the 10 * "Software"), to deal in the Software without restriction, including 11 * without limitation the rights to use, copy, modify, merge, publish, 12 * distribute, sublicense, and/or sell copies of the Software, and to 13 * permit persons to whom the Software is furnished to do so, subject to 14 * the following conditions: 15 * 16 * The above copyright notice and this permission notice shall be included 17 * in all copies or substantial portions of the Software. 18 * 19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 22 * IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY 23 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 26 * 27 * Except as contained in this notice, the name(s) of the above copyright 28 * holders shall not be used in advertising or otherwise to promote the 29 * sale, use or other dealings in this Software without prior written 30 * authorization. 31 * 32 *---------------------------------------------------------------------------- 33 * Compact and efficient reimplementation of the 34 * xterm character class mechanism for large character sets 35 * 36 * Markus Kuhn -- mkuhn@acm.org -- 2000-07-03 37 * 38 * xterm allows users to select entire words with a double-click on the left 39 * mouse button. Opinions might differ on what type of characters are part of 40 * separate words, therefore xterm allows users to configure a class code for 41 * each 8-bit character. Words are maximum length sequences of neighboring 42 * characters with identical class code. Extending this mechanism to Unicode 43 * naively would create an at least 2^16 entries (128 kB) long class code 44 * table. 45 * 46 * Instead, we transform the character class table into a list of intervals, 47 * that will be accessed via a linear search. Changes made to the table by the 48 * user will be appended. A special class code IDENT (default) marks 49 * characters who have their code number as the class code. 50 * 51 * We could alternatively use a sorted table of non-overlapping intervals that 52 * can be accessed via binary search, but merging in new intervals is 53 * significantly more hassle and not worth the effort here. 54 */ 55 56#include <xterm.h> 57#include <charclass.h> 58 59#if OPT_WIDE_CHARS 60 61#ifdef TEST_DRIVER 62 63#include <ctype.h> 64#include <wchar.h> 65#include <wctype.h> 66 67#if OPT_TRACE 68#define Trace if (opt_v) printf 69#endif 70 71#undef OPT_REPORT_CCLASS 72#define OPT_REPORT_CCLASS 1 73#endif /* TEST_DRIVER */ 74 75static struct classentry { 76 int cclass; 77 int first; 78 int last; 79} *classtab; 80 81typedef enum { 82 IDENT = -1, 83 OTHER = 0, 84 CNTRL = 1, 85 ALNUM = 48, 86 BLANK = 32, 87 U_CJK = 0x4e00, 88 U_SUP = 0x2070, 89 U_SUB = 0x2080, 90 U_HIR = 0x3040, 91 U_KAT = 0x30a0, 92 U_HAN = 0xac00 93} Classes; 94 95#ifdef TEST_DRIVER 96static int opt_all; 97static int opt_check; 98static int opt_quiet; 99static int opt_v; 100#endif 101 102void 103init_classtab(void) 104{ 105 const int size = 50; 106 107 TRACE(("init_classtab " TRACE_L "\n")); 108 109 classtab = TypeMallocN(struct classentry, (unsigned) size); 110 if (!classtab) 111 abort(); 112 classtab[0].cclass = size; 113 classtab[0].first = 1; 114 classtab[0].last = 0; 115 116 /* old xterm default classes */ 117 SetCharacterClassRange(0, 0, BLANK); 118 SetCharacterClassRange(1, 31, CNTRL); 119 SetCharacterClassRange('\t', '\t', BLANK); 120 SetCharacterClassRange('0', '9', ALNUM); 121 SetCharacterClassRange('A', 'Z', ALNUM); 122 SetCharacterClassRange('_', '_', ALNUM); 123 SetCharacterClassRange('a', 'z', ALNUM); 124 SetCharacterClassRange(127, 159, CNTRL); 125 SetCharacterClassRange(160, 191, IDENT); 126 SetCharacterClassRange(192, 255, ALNUM); 127 SetCharacterClassRange(215, 215, IDENT); 128 SetCharacterClassRange(247, 247, IDENT); 129 130 /* added Unicode classes */ 131 SetCharacterClassRange(0x0100, 0xffdf, ALNUM); /* mostly characters */ 132 SetCharacterClassRange(0x037e, 0x037e, IDENT); /* Greek question mark */ 133 SetCharacterClassRange(0x0387, 0x0387, IDENT); /* Greek ano teleia */ 134 SetCharacterClassRange(0x055a, 0x055f, IDENT); /* Armenian punctuation */ 135 SetCharacterClassRange(0x0589, 0x0589, IDENT); /* Armenian full stop */ 136 SetCharacterClassRange(0x0700, 0x070d, IDENT); /* Syriac punctuation */ 137 SetCharacterClassRange(0x104a, 0x104f, IDENT); /* Myanmar punctuation */ 138 SetCharacterClassRange(0x10fb, 0x10fb, IDENT); /* Georgian punctuation */ 139 SetCharacterClassRange(0x1361, 0x1368, IDENT); /* Ethiopic punctuation */ 140 SetCharacterClassRange(0x166d, 0x166e, IDENT); /* Canadian Syl. punctuation */ 141 SetCharacterClassRange(0x17d4, 0x17dc, IDENT); /* Khmer punctuation */ 142 SetCharacterClassRange(0x1800, 0x180a, IDENT); /* Mongolian punctuation */ 143 SetCharacterClassRange(0x2000, 0x200a, BLANK); /* spaces */ 144 SetCharacterClassRange(0x200b, 0x27ff, IDENT); /* punctuation and symbols */ 145 SetCharacterClassRange(0x2070, 0x207f, U_SUP); /* superscript */ 146 SetCharacterClassRange(0x2080, 0x208f, U_SUB); /* subscript */ 147 SetCharacterClassRange(0x3000, 0x3000, BLANK); /* ideographic space */ 148 SetCharacterClassRange(0x3001, 0x3020, IDENT); /* ideographic punctuation */ 149 SetCharacterClassRange(0x3040, 0x309f, U_HIR); /* Hiragana */ 150 SetCharacterClassRange(0x30a0, 0x30ff, U_KAT); /* Katakana */ 151 SetCharacterClassRange(0x3300, 0x9fff, U_CJK); /* CJK Ideographs */ 152 SetCharacterClassRange(0xac00, 0xd7a3, U_HAN); /* Hangul Syllables */ 153 SetCharacterClassRange(0xf900, 0xfaff, U_CJK); /* CJK Ideographs */ 154 SetCharacterClassRange(0xfe30, 0xfe6b, IDENT); /* punctuation forms */ 155 SetCharacterClassRange(0xff00, 0xff0f, IDENT); /* half/fullwidth ASCII */ 156 SetCharacterClassRange(0xff1a, 0xff20, IDENT); /* half/fullwidth ASCII */ 157 SetCharacterClassRange(0xff3b, 0xff40, IDENT); /* half/fullwidth ASCII */ 158 SetCharacterClassRange(0xff5b, 0xff64, IDENT); /* half/fullwidth ASCII */ 159 160 TRACE((TRACE_R " init_classtab\n")); 161 return; 162} 163 164int 165CharacterClass(int c) 166{ 167 int i, cclass = IDENT; 168 169 for (i = classtab[0].first; i <= classtab[0].last; i++) 170 if (classtab[i].first <= c && classtab[i].last >= c) 171 cclass = classtab[i].cclass; 172 173 if (cclass < 0) 174 cclass = c; 175 176 return cclass; 177} 178 179#if OPT_REPORT_CCLASS 180#define charFormat(code) ((code) > 255 ? "0x%04X" : "%d") 181static const char * 182class_name(Classes code) 183{ 184 static char buffer[80]; 185 const char *result = "?"; 186 switch (code) { 187 case ALNUM: 188 result = "ALNUM"; 189 break; 190 case BLANK: 191 result = "BLANK"; 192 break; 193 case CNTRL: 194 result = "CNTRL"; 195 break; 196 case OTHER: 197 result = "OTHER"; 198 break; 199 case IDENT: 200 result = "IDENT"; 201 break; 202 case U_SUP: 203 result = "superscript"; 204 break; 205 case U_SUB: 206 result = "subscript"; 207 break; 208 case U_CJK: 209 result = "CJK Ideographs"; 210 break; 211 case U_HIR: 212 result = "Hiragana"; 213 break; 214 case U_KAT: 215 result = "Katakana"; 216 break; 217 case U_HAN: 218 result = "Hangul Syllables"; 219 break; 220 default: 221 sprintf(buffer, charFormat(code), code); 222 result = buffer; 223 break; 224 } 225 return result; 226} 227 228/* 229 * Special convention for classtab[0]: 230 * - classtab[0].cclass is the allocated number of entries in classtab 231 * - classtab[0].first = 1 (first used entry in classtab) 232 * - classtab[0].last is the last used entry in classtab 233 */ 234 235int 236SetCharacterClassRange(int low, int high, int value) 237{ 238 TRACE(("...SetCharacterClassRange (U+%04X .. U+%04X) = %s\n", 239 low, high, class_name(value))); 240 241 if (high < low) 242 return -1; /* nothing to do */ 243 244 /* make sure we have at least one free entry left at table end */ 245 if (classtab[0].last > classtab[0].cclass - 2) { 246 classtab[0].cclass += 5 + classtab[0].cclass / 4; 247 classtab = TypeRealloc(struct classentry, 248 (unsigned) classtab[0].cclass, classtab); 249 if (!classtab) 250 abort(); 251 } 252 253 /* simply append new interval to end of interval array */ 254 classtab[0].last++; 255 classtab[classtab[0].last].first = low; 256 classtab[classtab[0].last].last = high; 257 classtab[classtab[0].last].cclass = value; 258 259 return 0; 260} 261 262void 263report_wide_char_class(void) 264{ 265 static const Classes known_classes[] = 266 {IDENT, ALNUM, CNTRL, BLANK, U_SUP, U_SUB, U_HIR, U_KAT, U_CJK, U_HAN}; 267 int i; 268 269 printf("\n"); 270 printf("Unicode charClass data uses the last match\n"); 271 printf("from these overlapping intervals of character codes:\n"); 272 for (i = classtab[0].first; i <= classtab[0].last; i++) { 273 printf("\tU+%04X .. U+%04X %s\n", 274 classtab[i].first, 275 classtab[i].last, 276 class_name((Classes) classtab[i].cclass)); 277 } 278 printf("\n"); 279 printf("These class-names are used internally (the first character code in a class):\n"); 280 for (i = 0; i < (int) XtNumber(known_classes); ++i) { 281 printf("\t"); 282 printf(charFormat(known_classes[i]), known_classes[i]); 283 printf(" = %s\n", class_name(known_classes[i])); 284 } 285} 286#endif /* OPT_REPORT_CCLASS */ 287 288#ifdef NO_LEAKS 289void 290noleaks_CharacterClass(void) 291{ 292 FreeAndNull(classtab); 293} 294#endif 295#endif /* OPT_WIDE_CHARS */ 296 297#ifdef TEST_DRIVER 298#if OPT_WIDE_CHARS 299static void 300usage(void) 301{ 302 static const char *msg[] = 303 { 304 "Usage: test_charclass [options] [c1[-c1b] [c2-[c2b] [...]]]", 305 "", 306 "Options:", 307 " -a show all data", 308 " -s show only summary", 309 " -v verbose" 310 }; 311 size_t n; 312 for (n = 0; n < sizeof(msg) / sizeof(msg[0]); ++n) { 313 fprintf(stderr, "%s\n", msg[n]); 314 } 315 exit(EXIT_FAILURE); 316} 317 318static int 319expected_class(int wch) 320{ 321 int result = wch; 322 wint_t ch = (wint_t) wch; 323 if (ch == '\0' || ch == '\t') { 324 result = BLANK; 325 } else if (iswcntrl(ch)) { 326 result = CNTRL; 327 } else if (iswspace(ch)) { 328 result = BLANK; 329 } else if (ch < 127) { 330 if (isalnum(ch) || ch == '_') { 331 result = ALNUM; 332 } 333 } else if (ch == 170 || ch == 181 || ch == 186) { 334 ; 335 } else if (iswalnum(ch)) { 336 result = ALNUM; 337 } 338 return result; 339} 340 341static int 342show_cclass_range(int lo, int hi) 343{ 344 int cclass = CharacterClass(lo); 345 int ident = (cclass == lo); 346 int more = 0; 347 if (ident) { 348 int ch; 349 for (ch = lo + 1; ch <= hi; ch++) { 350 if (CharacterClass(ch) != ch) { 351 ident = 0; 352 break; 353 } 354 } 355 if (ident && (hi < 255)) { 356 ch = hi + 1; 357 if (CharacterClass(ch) == ch) { 358 if (ch >= 255 || CharacterClass(ch + 1) != ch) { 359 more = 1; 360 } 361 } 362 } 363 } 364 if (!more) { 365 if (lo == hi) { 366 printf("\t%d", lo); 367 } else { 368 printf("\t%d-%d", lo, hi); 369 } 370 if (!ident) 371 printf(":%d", cclass); 372 if (hi < 255) 373 printf(", \\"); 374 printf("\n"); 375 } 376 return !more; 377} 378 379static void 380report_resource(int first, int last) 381{ 382 int class_p; 383 int ch; 384 int dh; 385 386 class_p = CharacterClass(dh = first); 387 for (ch = first; ch < last; ++ch) { 388 int class_c = CharacterClass(ch); 389 if (class_c != class_p) { 390 if (show_cclass_range(dh, ch - 1)) { 391 dh = ch; 392 class_p = class_c; 393 } 394 } 395 } 396 if (dh < last - 1) { 397 show_cclass_range(dh, last - 1); 398 } 399} 400 401static int 402decode_one(const char *source, char **target) 403{ 404 int result = -1; 405 long check; 406 int radix = 0; 407 if ((source[0] == 'u' || source[0] == 'U') && source[1] == '+') { 408 source += 2; 409 radix = 16; 410 } 411 check = strtol(source, target, radix); 412 if (*target != NULL && *target != source) 413 result = (int) check; 414 return result; 415} 416 417static int 418decode_range(const char *source, int *lo, int *hi) 419{ 420 int result = 0; 421 char *after1; 422 char *after2; 423 if ((*lo = decode_one(source, &after1)) >= 0) { 424 after1 += strspn(after1, ":-.\t "); 425 if ((*hi = decode_one(after1, &after2)) < 0) { 426 *hi = *lo; 427 } 428 result = 1; 429 } 430 return result; 431} 432 433static void 434do_range(const char *source) 435{ 436 int lo, hi; 437 if (decode_range(source, &lo, &hi)) { 438 if (opt_all) { 439 while (lo <= hi) { 440 int other_rc = CharacterClass(lo); 441 if (!opt_quiet) 442 printf("U+%04X\t%s\n", lo, class_name(other_rc)); 443 ++lo; 444 } 445 } else if (opt_check) { 446 while (lo <= hi) { 447 int expect = expected_class(lo); 448 int actual = CharacterClass(lo); 449 if (actual != expect) 450 printf("U+%04X\t%s ->%s\n", lo, 451 class_name(expect), 452 class_name(actual)); 453 ++lo; 454 } 455 } else { 456 printf("\"charClass\" resource for [%d..%d]:\n", lo, hi); 457 report_resource(lo, hi + 1); 458 } 459 } 460} 461#endif /* OPT_WIDE_CHARS */ 462 463/* 464 * TODO: add option to show do_range in hex 465 */ 466int 467main(int argc, char **argv ENVP_ARG) 468{ 469#if OPT_WIDE_CHARS 470 int ch; 471#endif 472 473 (void) argc; 474 (void) argv; 475 476#if OPT_WIDE_CHARS 477 setlocale(LC_ALL, ""); 478 while ((ch = getopt(argc, argv, "acsv")) != -1) { 479 switch (ch) { 480 case 'a': 481 opt_all = 1; 482 break; 483 case 'c': 484 opt_check = 1; 485 break; 486 case 's': 487 opt_quiet = 1; 488 break; 489 case 'v': 490 opt_v = 1; 491 break; 492 default: 493 usage(); 494 } 495 } 496 init_classtab(); 497 498 if (optind >= argc) { 499 do_range("0-255"); 500 } else { 501 while (optind < argc) { 502 do_range(argv[optind++]); 503 } 504 } 505 report_wide_char_class(); 506#else 507 printf("wide-character support is not configured\n"); 508#endif /* OPT_WIDE_CHARS */ 509 return 0; 510} 511#endif /* TEST_DRIVER */ 512