1/* $XTermId: charclass.c,v 1.50 2023/04/01 00:11:47 tom Exp $ */ 2 3/* 4 * Copyright 2002-2022,2023 by Thomas E. Dickey 5 * 6 * All Rights Reserved 7 * 8 * Permission is hereby granted, free of charge, to any person obtaining a 9 * copy of this software and associated documentation files (the 10 * "Software"), to deal in the Software without restriction, including 11 * without limitation the rights to use, copy, modify, merge, publish, 12 * distribute, sublicense, and/or sell copies of the Software, and to 13 * permit persons to whom the Software is furnished to do so, subject to 14 * the following conditions: 15 * 16 * The above copyright notice and this permission notice shall be included 17 * in all copies or substantial portions of the Software. 18 * 19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 22 * IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY 23 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 26 * 27 * Except as contained in this notice, the name(s) of the above copyright 28 * holders shall not be used in advertising or otherwise to promote the 29 * sale, use or other dealings in this Software without prior written 30 * authorization. 31 * 32 *---------------------------------------------------------------------------- 33 * Compact and efficient reimplementation of the 34 * xterm character class mechanism for large character sets 35 * 36 * Markus Kuhn -- mkuhn@acm.org -- 2000-07-03 37 * 38 * xterm allows users to select entire words with a double-click on the left 39 * mouse button. Opinions might differ on what type of characters are part of 40 * separate words, therefore xterm allows users to configure a class code for 41 * each 8-bit character. Words are maximum length sequences of neighboring 42 * characters with identical class code. Extending this mechanism to Unicode 43 * naively would create an at least 2^16 entries (128 kB) long class code 44 * table. 45 * 46 * Instead, we transform the character class table into a list of intervals, 47 * that will be accessed via a linear search. Changes made to the table by the 48 * user will be appended. A special class code IDENT (default) marks 49 * characters who have their code number as the class code. 50 * 51 * We could alternatively use a sorted table of non-overlapping intervals that 52 * can be accessed via binary search, but merging in new intervals is 53 * significantly more hassle and not worth the effort here. 54 */ 55 56#include <xterm.h> 57#include <charclass.h> 58 59#if OPT_WIDE_CHARS 60 61#ifdef TEST_DRIVER 62 63#include <ctype.h> 64#include <wchar.h> 65#include <wctype.h> 66 67#if OPT_TRACE 68#define Trace if (opt_v) printf 69#endif 70 71#undef OPT_REPORT_CCLASS 72#define OPT_REPORT_CCLASS 1 73#endif /* TEST_DRIVER */ 74 75static struct classentry { 76 int cclass; 77 int first; 78 int last; 79} *classtab; 80 81#ifdef TEST_DRIVER 82static int opt_all; 83static int opt_check; 84static int opt_quiet; 85static int opt_v; 86#endif 87 88void 89init_classtab(void) 90{ 91 const int size = 50; 92 93 TRACE(("init_classtab " TRACE_L "\n")); 94 95 classtab = TypeMallocN(struct classentry, (unsigned) size); 96 if (!classtab) 97 abort(); 98 classtab[0].cclass = size; 99 classtab[0].first = 1; 100 classtab[0].last = 0; 101 102 /* old xterm default classes */ 103 SetCharacterClassRange(0, 0, BLANK); 104 SetCharacterClassRange(1, 31, CNTRL); 105 SetCharacterClassRange('\t', '\t', BLANK); 106 SetCharacterClassRange('0', '9', ALNUM); 107 SetCharacterClassRange('A', 'Z', ALNUM); 108 SetCharacterClassRange('_', '_', ALNUM); 109 SetCharacterClassRange('a', 'z', ALNUM); 110 SetCharacterClassRange(127, 159, CNTRL); 111 SetCharacterClassRange(160, 191, IDENT); 112 SetCharacterClassRange(192, 255, ALNUM); 113 SetCharacterClassRange(215, 215, IDENT); 114 SetCharacterClassRange(247, 247, IDENT); 115 116 /* added Unicode classes */ 117 SetCharacterClassRange(0x0100, 0xffdf, ALNUM); /* mostly characters */ 118 SetCharacterClassRange(0x037e, 0x037e, IDENT); /* Greek question mark */ 119 SetCharacterClassRange(0x0387, 0x0387, IDENT); /* Greek ano teleia */ 120 SetCharacterClassRange(0x055a, 0x055f, IDENT); /* Armenian punctuation */ 121 SetCharacterClassRange(0x0589, 0x0589, IDENT); /* Armenian full stop */ 122 SetCharacterClassRange(0x0700, 0x070d, IDENT); /* Syriac punctuation */ 123 SetCharacterClassRange(0x104a, 0x104f, IDENT); /* Myanmar punctuation */ 124 SetCharacterClassRange(0x10fb, 0x10fb, IDENT); /* Georgian punctuation */ 125 SetCharacterClassRange(0x1361, 0x1368, IDENT); /* Ethiopic punctuation */ 126 SetCharacterClassRange(0x166d, 0x166e, IDENT); /* Canadian Syl. punctuation */ 127 SetCharacterClassRange(0x17d4, 0x17dc, IDENT); /* Khmer punctuation */ 128 SetCharacterClassRange(0x1800, 0x180a, IDENT); /* Mongolian punctuation */ 129 SetCharacterClassRange(0x2000, 0x200a, BLANK); /* spaces */ 130 SetCharacterClassRange(0x200b, 0x200f, CNTRL); /* formatting */ 131 SetCharacterClassRange(0x2010, 0x27ff, IDENT); /* punctuation and symbols */ 132 SetCharacterClassRange(0x202a, 0x202e, CNTRL); /* formatting */ 133 SetCharacterClassRange(0x2060, 0x206f, CNTRL); /* formatting */ 134 SetCharacterClassRange(0x2070, 0x207f, U_SUP); /* superscript */ 135 SetCharacterClassRange(0x2080, 0x208f, U_SUB); /* subscript */ 136 SetCharacterClassRange(0x3000, 0x3000, BLANK); /* ideographic space */ 137 SetCharacterClassRange(0x3001, 0x3020, IDENT); /* ideographic punctuation */ 138 SetCharacterClassRange(0x3040, 0x309f, U_HIR); /* Hiragana */ 139 SetCharacterClassRange(0x30a0, 0x30ff, U_KAT); /* Katakana */ 140 SetCharacterClassRange(0x3300, 0x9fff, U_CJK); /* CJK Ideographs */ 141 SetCharacterClassRange(0xac00, 0xd7a3, U_HAN); /* Hangul Syllables */ 142 SetCharacterClassRange(0xf900, 0xfaff, U_CJK); /* CJK Ideographs */ 143 SetCharacterClassRange(0xfe30, 0xfe6b, IDENT); /* punctuation forms */ 144 SetCharacterClassRange(0xfeff, 0xfeff, CNTRL); /* formatting */ 145 SetCharacterClassRange(0xff00, 0xff0f, IDENT); /* half/fullwidth ASCII */ 146 SetCharacterClassRange(0xff1a, 0xff20, IDENT); /* half/fullwidth ASCII */ 147 SetCharacterClassRange(0xff3b, 0xff40, IDENT); /* half/fullwidth ASCII */ 148 SetCharacterClassRange(0xff5b, 0xff64, IDENT); /* half/fullwidth ASCII */ 149 SetCharacterClassRange(0xfff9, 0xfffb, CNTRL); /* formatting */ 150 151 TRACE((TRACE_R " init_classtab\n")); 152 return; 153} 154 155int 156CharacterClass(int c) 157{ 158 int i, cclass = IDENT; 159 160 for (i = classtab[0].first; i <= classtab[0].last; i++) 161 if (classtab[i].first <= c && classtab[i].last >= c) 162 cclass = classtab[i].cclass; 163 164 if (cclass < 0) 165 cclass = c; 166 167 return cclass; 168} 169 170#if OPT_REPORT_CCLASS 171#define charFormat(code) ((code) > 255 ? "0x%04X" : "%d") 172static const char * 173class_name(Classes code) 174{ 175 static char buffer[80]; 176 const char *result = "?"; 177 switch (code) { 178 case ALNUM: 179 result = "ALNUM"; 180 break; 181 case BLANK: 182 result = "BLANK"; 183 break; 184 case CNTRL: 185 result = "CNTRL"; 186 break; 187 case OTHER: 188 result = "OTHER"; 189 break; 190 case IDENT: 191 result = "IDENT"; 192 break; 193 case U_SUP: 194 result = "superscript"; 195 break; 196 case U_SUB: 197 result = "subscript"; 198 break; 199 case U_CJK: 200 result = "CJK Ideographs"; 201 break; 202 case U_HIR: 203 result = "Hiragana"; 204 break; 205 case U_KAT: 206 result = "Katakana"; 207 break; 208 case U_HAN: 209 result = "Hangul Syllables"; 210 break; 211 default: 212 sprintf(buffer, charFormat(code), code); 213 result = buffer; 214 break; 215 } 216 return result; 217} 218 219/* 220 * Special convention for classtab[0]: 221 * - classtab[0].cclass is the allocated number of entries in classtab 222 * - classtab[0].first = 1 (first used entry in classtab) 223 * - classtab[0].last is the last used entry in classtab 224 */ 225 226int 227SetCharacterClassRange(int low, int high, int value) 228{ 229 TRACE(("...SetCharacterClassRange (U+%04X .. U+%04X) = %s\n", 230 low, high, class_name(value))); 231 232 if (high < low) 233 return -1; /* nothing to do */ 234 235 /* make sure we have at least one free entry left at table end */ 236 if (classtab[0].last > classtab[0].cclass - 2) { 237 classtab[0].cclass += 5 + classtab[0].cclass / 4; 238 classtab = TypeRealloc(struct classentry, 239 (unsigned) classtab[0].cclass, classtab); 240 if (!classtab) 241 abort(); 242 } 243 244 /* simply append new interval to end of interval array */ 245 classtab[0].last++; 246 classtab[classtab[0].last].first = low; 247 classtab[classtab[0].last].last = high; 248 classtab[classtab[0].last].cclass = value; 249 250 return 0; 251} 252 253void 254report_wide_char_class(void) 255{ 256 static const Classes known_classes[] = 257 {IDENT, ALNUM, CNTRL, BLANK, U_SUP, U_SUB, U_HIR, U_KAT, U_CJK, U_HAN}; 258 int i; 259 260 printf("\n"); 261 printf("Unicode charClass data uses the last match\n"); 262 printf("from these overlapping intervals of character codes:\n"); 263 for (i = classtab[0].first; i <= classtab[0].last; i++) { 264 printf("\tU+%04X .. U+%04X %s\n", 265 (unsigned) classtab[i].first, 266 (unsigned) classtab[i].last, 267 class_name((Classes) classtab[i].cclass)); 268 } 269 printf("\n"); 270 printf("These class-names are used internally (the first character code in a class):\n"); 271 for (i = 0; i < (int) XtNumber(known_classes); ++i) { 272 printf("\t"); 273 printf(charFormat(known_classes[i]), known_classes[i]); 274 printf(" = %s\n", class_name(known_classes[i])); 275 } 276} 277#endif /* OPT_REPORT_CCLASS */ 278 279#ifdef NO_LEAKS 280void 281noleaks_CharacterClass(void) 282{ 283 FreeAndNull(classtab); 284} 285#endif 286#endif /* OPT_WIDE_CHARS */ 287 288#ifdef TEST_DRIVER 289#if OPT_WIDE_CHARS 290static void 291usage(void) 292{ 293 static const char *msg[] = 294 { 295 "Usage: test_charclass [options] [c1[-c1b] [c2-[c2b] [...]]]", 296 "", 297 "Options:", 298 " -a show all data", 299 " -s show only summary", 300 " -v verbose" 301 }; 302 size_t n; 303 for (n = 0; n < sizeof(msg) / sizeof(msg[0]); ++n) { 304 fprintf(stderr, "%s\n", msg[n]); 305 } 306 exit(EXIT_FAILURE); 307} 308 309static int 310expected_class(int wch) 311{ 312 int result = wch; 313 wint_t ch = (wint_t) wch; 314 if (wch < 0 || ch == '\0' || ch == '\t') { 315 result = BLANK; 316 } else if (iswcntrl(ch)) { 317 result = CNTRL; 318 } else if (iswspace(ch)) { 319 result = BLANK; 320 } else if (ch < 127) { 321 if (isalnum(ch) || ch == '_') { 322 result = ALNUM; 323 } 324 } else if (ch == 170 || ch == 181 || ch == 186) { 325 ; 326 } else if (iswalnum(ch)) { 327 result = ALNUM; 328 } 329 return result; 330} 331 332static int 333show_cclass_range(int lo, int hi) 334{ 335 int cclass = CharacterClass(lo); 336 int ident = (cclass == lo); 337 int more = 0; 338 if (ident) { 339 int ch; 340 for (ch = lo + 1; ch <= hi; ch++) { 341 if (CharacterClass(ch) != ch) { 342 ident = 0; 343 break; 344 } 345 } 346 if (ident && (hi < 255)) { 347 ch = hi + 1; 348 if (CharacterClass(ch) == ch) { 349 if (ch >= 255 || CharacterClass(ch + 1) != ch) { 350 more = 1; 351 } 352 } 353 } 354 } 355 if (!more) { 356 if (lo == hi) { 357 printf("\t%d", lo); 358 } else { 359 printf("\t%d-%d", lo, hi); 360 } 361 if (!ident) 362 printf(":%d", cclass); 363 if (hi < 255) 364 printf(", \\"); 365 printf("\n"); 366 } 367 return !more; 368} 369 370static void 371report_resource(int first, int last) 372{ 373 int class_p; 374 int ch; 375 int dh; 376 377 class_p = CharacterClass(dh = first); 378 for (ch = first; ch < last; ++ch) { 379 int class_c = CharacterClass(ch); 380 if (class_c != class_p) { 381 if (show_cclass_range(dh, ch - 1)) { 382 dh = ch; 383 class_p = class_c; 384 } 385 } 386 } 387 if (dh < last - 1) { 388 show_cclass_range(dh, last - 1); 389 } 390} 391 392static int 393decode_one(const char *source, char **target) 394{ 395 int result = -1; 396 long check; 397 int radix = 0; 398 if ((source[0] == 'u' || source[0] == 'U') && source[1] == '+') { 399 source += 2; 400 radix = 16; 401 } 402 check = strtol(source, target, radix); 403 if (*target != NULL && *target != source) 404 result = (int) check; 405 return result; 406} 407 408static int 409decode_range(const char *source, int *lo, int *hi) 410{ 411 int result = 0; 412 char *after1; 413 char *after2; 414 if ((*lo = decode_one(source, &after1)) >= 0) { 415 after1 += strspn(after1, ":-.\t "); 416 if ((*hi = decode_one(after1, &after2)) < 0) { 417 *hi = *lo; 418 } 419 result = 1; 420 } 421 return result; 422} 423 424static void 425do_range(const char *source) 426{ 427 int lo, hi; 428 if (decode_range(source, &lo, &hi)) { 429 if (opt_all) { 430 while (lo <= hi) { 431 int other_rc = CharacterClass(lo); 432 if (!opt_quiet) 433 printf("U+%04X\t%s\n", lo, class_name(other_rc)); 434 ++lo; 435 } 436 } else if (opt_check) { 437 while (lo <= hi) { 438 int expect = expected_class(lo); 439 int actual = CharacterClass(lo); 440 if (actual != expect) 441 printf("U+%04X\t%s ->%s\n", lo, 442 class_name(expect), 443 class_name(actual)); 444 ++lo; 445 } 446 } else { 447 printf("\"charClass\" resource for [%d..%d]:\n", lo, hi); 448 report_resource(lo, hi + 1); 449 } 450 } 451} 452#endif /* OPT_WIDE_CHARS */ 453 454/* 455 * TODO: add option to show do_range in hex 456 */ 457int 458main(int argc, char **argv ENVP_ARG) 459{ 460#if OPT_WIDE_CHARS 461 int ch; 462#endif 463 464 (void) argc; 465 (void) argv; 466 467#if OPT_WIDE_CHARS 468 setlocale(LC_ALL, ""); 469 while ((ch = getopt(argc, argv, "acsv")) != -1) { 470 switch (ch) { 471 case 'a': 472 opt_all = 1; 473 break; 474 case 'c': 475 opt_check = 1; 476 break; 477 case 's': 478 opt_quiet = 1; 479 break; 480 case 'v': 481 opt_v = 1; 482 break; 483 default: 484 usage(); 485 } 486 } 487 init_classtab(); 488 489 if (optind >= argc) { 490 do_range("0-255"); 491 } else { 492 while (optind < argc) { 493 do_range(argv[optind++]); 494 } 495 } 496 report_wide_char_class(); 497#else 498 printf("wide-character support is not configured\n"); 499#endif /* OPT_WIDE_CHARS */ 500 return 0; 501} 502#endif /* TEST_DRIVER */ 503