1/* $XTermId: charclass.c,v 1.50 2023/04/01 00:11:47 tom Exp $ */
2
3/*
4 * Copyright 2002-2022,2023 by Thomas E. Dickey
5 *
6 *                         All Rights Reserved
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the
10 * "Software"), to deal in the Software without restriction, including
11 * without limitation the rights to use, copy, modify, merge, publish,
12 * distribute, sublicense, and/or sell copies of the Software, and to
13 * permit persons to whom the Software is furnished to do so, subject to
14 * the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be included
17 * in all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
23 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 * Except as contained in this notice, the name(s) of the above copyright
28 * holders shall not be used in advertising or otherwise to promote the
29 * sale, use or other dealings in this Software without prior written
30 * authorization.
31 *
32 *----------------------------------------------------------------------------
33 * Compact and efficient reimplementation of the
34 * xterm character class mechanism for large character sets
35 *
36 * Markus Kuhn -- mkuhn@acm.org -- 2000-07-03
37 *
38 * xterm allows users to select entire words with a double-click on the left
39 * mouse button.  Opinions might differ on what type of characters are part of
40 * separate words, therefore xterm allows users to configure a class code for
41 * each 8-bit character.  Words are maximum length sequences of neighboring
42 * characters with identical class code.  Extending this mechanism to Unicode
43 * naively would create an at least 2^16 entries (128 kB) long class code
44 * table.
45 *
 * Instead, we transform the character class table into a list of intervals
 * that is accessed via a linear search.  Changes made to the table by the
 * user are appended, and the last matching interval wins.  A special class
 * code IDENT (the default) marks characters whose class code is their own
 * code number.
50 *
51 * We could alternatively use a sorted table of non-overlapping intervals that
52 * can be accessed via binary search, but merging in new intervals is
53 * significantly more hassle and not worth the effort here.
54 */
55
56#include <xterm.h>
57#include <charclass.h>
58
59#if OPT_WIDE_CHARS
60
61#ifdef TEST_DRIVER
62
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <wchar.h>
#include <wctype.h>
66
67#if OPT_TRACE
68#define Trace if (opt_v) printf
69#endif
70
71#undef OPT_REPORT_CCLASS
72#define OPT_REPORT_CCLASS 1
73#endif /* TEST_DRIVER */
74
/*
 * One interval of the character-class table: every code in the inclusive
 * range first..last is assigned the class "cclass".
 *
 * Entry 0 of the table is not an interval; it is used as a header:
 *   cclass = number of entries allocated,
 *   first  = index of the first interval (always 1),
 *   last   = index of the last interval in use.
 */
struct classentry {
    int cclass;
    int first;
    int last;
};

/* the table itself, allocated by init_classtab() */
static struct classentry *classtab;
80
#ifdef TEST_DRIVER
/*
 * Command-line switches of the test driver.  Each one stays zero until
 * main() sees the corresponding option.
 */
static int opt_all = 0;		/* -a: print the class of every code in a range */
static int opt_check = 0;	/* -c: compare against expected_class() */
static int opt_quiet = 0;	/* -s: suppress the per-code lines of -a */
static int opt_v = 0;		/* -v: verbose */
#endif
87
88void
89init_classtab(void)
90{
91    const int size = 50;
92
93    TRACE(("init_classtab " TRACE_L "\n"));
94
95    classtab = TypeMallocN(struct classentry, (unsigned) size);
96    if (!classtab)
97	abort();
98    classtab[0].cclass = size;
99    classtab[0].first = 1;
100    classtab[0].last = 0;
101
102    /* old xterm default classes */
103    SetCharacterClassRange(0, 0, BLANK);
104    SetCharacterClassRange(1, 31, CNTRL);
105    SetCharacterClassRange('\t', '\t', BLANK);
106    SetCharacterClassRange('0', '9', ALNUM);
107    SetCharacterClassRange('A', 'Z', ALNUM);
108    SetCharacterClassRange('_', '_', ALNUM);
109    SetCharacterClassRange('a', 'z', ALNUM);
110    SetCharacterClassRange(127, 159, CNTRL);
111    SetCharacterClassRange(160, 191, IDENT);
112    SetCharacterClassRange(192, 255, ALNUM);
113    SetCharacterClassRange(215, 215, IDENT);
114    SetCharacterClassRange(247, 247, IDENT);
115
116    /* added Unicode classes */
117    SetCharacterClassRange(0x0100, 0xffdf, ALNUM);	/* mostly characters */
118    SetCharacterClassRange(0x037e, 0x037e, IDENT);	/* Greek question mark */
119    SetCharacterClassRange(0x0387, 0x0387, IDENT);	/* Greek ano teleia */
120    SetCharacterClassRange(0x055a, 0x055f, IDENT);	/* Armenian punctuation */
121    SetCharacterClassRange(0x0589, 0x0589, IDENT);	/* Armenian full stop */
122    SetCharacterClassRange(0x0700, 0x070d, IDENT);	/* Syriac punctuation */
123    SetCharacterClassRange(0x104a, 0x104f, IDENT);	/* Myanmar punctuation */
124    SetCharacterClassRange(0x10fb, 0x10fb, IDENT);	/* Georgian punctuation */
125    SetCharacterClassRange(0x1361, 0x1368, IDENT);	/* Ethiopic punctuation */
126    SetCharacterClassRange(0x166d, 0x166e, IDENT);	/* Canadian Syl. punctuation */
127    SetCharacterClassRange(0x17d4, 0x17dc, IDENT);	/* Khmer punctuation */
128    SetCharacterClassRange(0x1800, 0x180a, IDENT);	/* Mongolian punctuation */
129    SetCharacterClassRange(0x2000, 0x200a, BLANK);	/* spaces */
130    SetCharacterClassRange(0x200b, 0x200f, CNTRL);	/* formatting */
131    SetCharacterClassRange(0x2010, 0x27ff, IDENT);	/* punctuation and symbols */
132    SetCharacterClassRange(0x202a, 0x202e, CNTRL);	/* formatting */
133    SetCharacterClassRange(0x2060, 0x206f, CNTRL);	/* formatting */
134    SetCharacterClassRange(0x2070, 0x207f, U_SUP);	/* superscript */
135    SetCharacterClassRange(0x2080, 0x208f, U_SUB);	/* subscript */
136    SetCharacterClassRange(0x3000, 0x3000, BLANK);	/* ideographic space */
137    SetCharacterClassRange(0x3001, 0x3020, IDENT);	/* ideographic punctuation */
138    SetCharacterClassRange(0x3040, 0x309f, U_HIR);	/* Hiragana */
139    SetCharacterClassRange(0x30a0, 0x30ff, U_KAT);	/* Katakana */
140    SetCharacterClassRange(0x3300, 0x9fff, U_CJK);	/* CJK Ideographs */
141    SetCharacterClassRange(0xac00, 0xd7a3, U_HAN);	/* Hangul Syllables */
142    SetCharacterClassRange(0xf900, 0xfaff, U_CJK);	/* CJK Ideographs */
143    SetCharacterClassRange(0xfe30, 0xfe6b, IDENT);	/* punctuation forms */
144    SetCharacterClassRange(0xfeff, 0xfeff, CNTRL);	/* formatting */
145    SetCharacterClassRange(0xff00, 0xff0f, IDENT);	/* half/fullwidth ASCII */
146    SetCharacterClassRange(0xff1a, 0xff20, IDENT);	/* half/fullwidth ASCII */
147    SetCharacterClassRange(0xff3b, 0xff40, IDENT);	/* half/fullwidth ASCII */
148    SetCharacterClassRange(0xff5b, 0xff64, IDENT);	/* half/fullwidth ASCII */
149    SetCharacterClassRange(0xfff9, 0xfffb, CNTRL);	/* formatting */
150
151    TRACE((TRACE_R " init_classtab\n"));
152    return;
153}
154
155int
156CharacterClass(int c)
157{
158    int i, cclass = IDENT;
159
160    for (i = classtab[0].first; i <= classtab[0].last; i++)
161	if (classtab[i].first <= c && classtab[i].last >= c)
162	    cclass = classtab[i].cclass;
163
164    if (cclass < 0)
165	cclass = c;
166
167    return cclass;
168}
169
170#if OPT_REPORT_CCLASS
171#define charFormat(code) ((code) > 255 ? "0x%04X" : "%d")
172static const char *
173class_name(Classes code)
174{
175    static char buffer[80];
176    const char *result = "?";
177    switch (code) {
178    case ALNUM:
179	result = "ALNUM";
180	break;
181    case BLANK:
182	result = "BLANK";
183	break;
184    case CNTRL:
185	result = "CNTRL";
186	break;
187    case OTHER:
188	result = "OTHER";
189	break;
190    case IDENT:
191	result = "IDENT";
192	break;
193    case U_SUP:
194	result = "superscript";
195	break;
196    case U_SUB:
197	result = "subscript";
198	break;
199    case U_CJK:
200	result = "CJK Ideographs";
201	break;
202    case U_HIR:
203	result = "Hiragana";
204	break;
205    case U_KAT:
206	result = "Katakana";
207	break;
208    case U_HAN:
209	result = "Hangul Syllables";
210	break;
211    default:
212	sprintf(buffer, charFormat(code), code);
213	result = buffer;
214	break;
215    }
216    return result;
217}
218
219/*
220 * Special convention for classtab[0]:
221 * - classtab[0].cclass is the allocated number of entries in classtab
222 * - classtab[0].first = 1 (first used entry in classtab)
223 * - classtab[0].last is the last used entry in classtab
224 */
225
226int
227SetCharacterClassRange(int low, int high, int value)
228{
229    TRACE(("...SetCharacterClassRange (U+%04X .. U+%04X) = %s\n",
230	   low, high, class_name(value)));
231
232    if (high < low)
233	return -1;		/* nothing to do */
234
235    /* make sure we have at least one free entry left at table end */
236    if (classtab[0].last > classtab[0].cclass - 2) {
237	classtab[0].cclass += 5 + classtab[0].cclass / 4;
238	classtab = TypeRealloc(struct classentry,
239			         (unsigned) classtab[0].cclass, classtab);
240	if (!classtab)
241	    abort();
242    }
243
244    /* simply append new interval to end of interval array */
245    classtab[0].last++;
246    classtab[classtab[0].last].first = low;
247    classtab[classtab[0].last].last = high;
248    classtab[classtab[0].last].cclass = value;
249
250    return 0;
251}
252
253void
254report_wide_char_class(void)
255{
256    static const Classes known_classes[] =
257    {IDENT, ALNUM, CNTRL, BLANK, U_SUP, U_SUB, U_HIR, U_KAT, U_CJK, U_HAN};
258    int i;
259
260    printf("\n");
261    printf("Unicode charClass data uses the last match\n");
262    printf("from these overlapping intervals of character codes:\n");
263    for (i = classtab[0].first; i <= classtab[0].last; i++) {
264	printf("\tU+%04X .. U+%04X %s\n",
265	       (unsigned) classtab[i].first,
266	       (unsigned) classtab[i].last,
267	       class_name((Classes) classtab[i].cclass));
268    }
269    printf("\n");
270    printf("These class-names are used internally (the first character code in a class):\n");
271    for (i = 0; i < (int) XtNumber(known_classes); ++i) {
272	printf("\t");
273	printf(charFormat(known_classes[i]), known_classes[i]);
274	printf(" = %s\n", class_name(known_classes[i]));
275    }
276}
277#endif /* OPT_REPORT_CCLASS */
278
#ifdef NO_LEAKS
/*
 * Dispose of the class table.  Only compiled when NO_LEAKS is defined.
 */
void
noleaks_CharacterClass(void)
{
    /* FreeAndNull is defined outside this file; presumably it frees the
       table and resets the pointer -- confirm against its definition */
    FreeAndNull(classtab);
}
#endif
286#endif /* OPT_WIDE_CHARS */
287
288#ifdef TEST_DRIVER
289#if OPT_WIDE_CHARS
/*
 * Print a summary of the test driver's command line on standard error and
 * exit with a failure status.  The option list matches the option string
 * "acsv" that main() passes to getopt.
 */
static void
usage(void)
{
    static const char *msg[] =
    {
	"Usage: test_charclass [options] [c1[-c1b] [c2[-c2b] [...]]]",
	"",
	"Options:",
	" -a  show all data",
	" -c  check against the locale's character types",
	" -s  show only summary",
	" -v  verbose"
    };
    size_t n;
    for (n = 0; n < sizeof(msg) / sizeof(msg[0]); ++n) {
	fprintf(stderr, "%s\n", msg[n]);
    }
    exit(EXIT_FAILURE);
}
308
309static int
310expected_class(int wch)
311{
312    int result = wch;
313    wint_t ch = (wint_t) wch;
314    if (wch < 0 || ch == '\0' || ch == '\t') {
315	result = BLANK;
316    } else if (iswcntrl(ch)) {
317	result = CNTRL;
318    } else if (iswspace(ch)) {
319	result = BLANK;
320    } else if (ch < 127) {
321	if (isalnum(ch) || ch == '_') {
322	    result = ALNUM;
323	}
324    } else if (ch == 170 || ch == 181 || ch == 186) {
325	;
326    } else if (iswalnum(ch)) {
327	result = ALNUM;
328    }
329    return result;
330}
331
/*
 * Print one item of a "charClass" resource value for the codes lo..hi,
 * whose class is taken from "lo".
 *
 * A range in which every code is its own class is printed without a
 * ":class" suffix.  If such a range ends below 255 and the following code
 * is also its own class, nothing is printed and 0 is returned so that the
 * caller can keep extending the range.  Otherwise the item is printed
 * (followed by a continuation marker when hi is below 255) and 1 is
 * returned.
 */
static int
show_cclass_range(int lo, int hi)
{
    const int cclass = CharacterClass(lo);
    int ident = (cclass == lo);
    int ch;

    /* the range counts as "identity" only if every member is its own class */
    for (ch = lo + 1; ident && ch <= hi; ch++) {
	if (CharacterClass(ch) != ch)
	    ident = 0;
    }

    if (ident && (hi < 255)) {
	ch = hi + 1;
	if (CharacterClass(ch) == ch
	    && (ch >= 255 || CharacterClass(ch + 1) != ch)) {
	    return 0;		/* more to come: let the caller extend */
	}
    }

    if (lo != hi) {
	printf("\t%d-%d", lo, hi);
    } else {
	printf("\t%d", lo);
    }
    if (!ident)
	printf(":%d", cclass);
    if (hi < 255)
	printf(", \\");
    printf("\n");

    return 1;
}
369
/*
 * Print the classes of the codes first..last-1 in the form of a "charClass"
 * resource value, one item per run of codes with the same class.
 *
 * A run is flushed when the class changes.  show_cclass_range() may decline
 * to print (returning 0) in order to merge neighboring codes that are each
 * their own class; in that case the start of the run is kept.
 *
 * The final flush must also cover a pending run of exactly one code (start
 * == last - 1); otherwise a request for a single code, or a range whose
 * last code begins a new run, would print nothing for it.
 */
static void
report_resource(int first, int last)
{
    int class_p;
    int ch;
    int dh;			/* first code of the run not yet printed */

    class_p = CharacterClass(dh = first);
    for (ch = first; ch < last; ++ch) {
	int class_c = CharacterClass(ch);
	if (class_c != class_p) {
	    if (show_cclass_range(dh, ch - 1)) {
		dh = ch;
		class_p = class_c;
	    }
	}
    }
    if (dh < last) {
	show_cclass_range(dh, last - 1);
    }
}
391
/*
 * Decode one character code at the start of "source".
 *
 * The number may be written as "U+" or "u+" followed by hexadecimal digits,
 * or in any form that strtol accepts with base 0 (decimal, 0x-prefixed
 * hexadecimal, 0-prefixed octal).
 *
 * On return *target points just past the text that was consumed.
 *
 * Returns the code, or a negative value if no number was found, if it is
 * negative, or if it does not fit in an int.
 */
static int
decode_one(const char *source, char **target)
{
    int result = -1;
    int radix = 0;
    long check;

    if ((source[0] == 'u' || source[0] == 'U') && source[1] == '+') {
	source += 2;
	radix = 16;
    }

    errno = 0;
    check = strtol(source, target, radix);
    if (*target != source	/* at least one digit was consumed */
	&& errno != ERANGE	/* value fits in a long */
	&& check >= 0
	&& check <= INT_MAX) {	/* value survives the cast below */
	result = (int) check;
    }
    return result;
}
407
/*
 * Decode a range of character codes such as "65", "65-90" or "U+41:U+5A".
 *
 * The first code is stored in *lo.  After skipping any of the separator
 * characters ':', '-', '.', tab and space, a second code is stored in *hi;
 * if there is none, *hi is set to the same value as *lo.
 *
 * Returns 1 if at least the first code could be decoded, otherwise 0.
 */
static int
decode_range(const char *source, int *lo, int *hi)
{
    char *next;
    char *ignored;

    *lo = decode_one(source, &next);
    if (*lo < 0)
	return 0;

    next += strspn(next, ":-.\t ");
    *hi = decode_one(next, &ignored);
    if (*hi < 0)
	*hi = *lo;		/* no upper bound given: single code */

    return 1;
}
423
424static void
425do_range(const char *source)
426{
427    int lo, hi;
428    if (decode_range(source, &lo, &hi)) {
429	if (opt_all) {
430	    while (lo <= hi) {
431		int other_rc = CharacterClass(lo);
432		if (!opt_quiet)
433		    printf("U+%04X\t%s\n", lo, class_name(other_rc));
434		++lo;
435	    }
436	} else if (opt_check) {
437	    while (lo <= hi) {
438		int expect = expected_class(lo);
439		int actual = CharacterClass(lo);
440		if (actual != expect)
441		    printf("U+%04X\t%s ->%s\n", lo,
442			   class_name(expect),
443			   class_name(actual));
444		++lo;
445	    }
446	} else {
447	    printf("\"charClass\" resource for [%d..%d]:\n", lo, hi);
448	    report_resource(lo, hi + 1);
449	}
450    }
451}
452#endif /* OPT_WIDE_CHARS */
453
454/*
455 * TODO: add option to show do_range in hex
456 */
457int
458main(int argc, char **argv ENVP_ARG)
459{
460#if OPT_WIDE_CHARS
461    int ch;
462#endif
463
464    (void) argc;
465    (void) argv;
466
467#if OPT_WIDE_CHARS
468    setlocale(LC_ALL, "");
469    while ((ch = getopt(argc, argv, "acsv")) != -1) {
470	switch (ch) {
471	case 'a':
472	    opt_all = 1;
473	    break;
474	case 'c':
475	    opt_check = 1;
476	    break;
477	case 's':
478	    opt_quiet = 1;
479	    break;
480	case 'v':
481	    opt_v = 1;
482	    break;
483	default:
484	    usage();
485	}
486    }
487    init_classtab();
488
489    if (optind >= argc) {
490	do_range("0-255");
491    } else {
492	while (optind < argc) {
493	    do_range(argv[optind++]);
494	}
495    }
496    report_wide_char_class();
497#else
498    printf("wide-character support is not configured\n");
499#endif /* OPT_WIDE_CHARS */
500    return 0;
501}
502#endif /* TEST_DRIVER */
503