charclass.c revision 913cc679
1/* $XTermId: charclass.c,v 1.28 2017/05/29 17:43:54 tom Exp $ */
2
3/*
4 * Compact and efficient reimplementation of the
5 * xterm character class mechanism for large character sets
6 *
7 * Markus Kuhn -- mkuhn@acm.org -- 2000-07-03
8 *
9 * xterm allows users to select entire words with a double-click on the left
10 * mouse button.  Opinions might differ on what type of characters are part of
11 * separate words, therefore xterm allows users to configure a class code for
12 * each 8-bit character.  Words are maximum length sequences of neighboring
13 * characters with identical class code.  Extending this mechanism to Unicode
 * naively would create a class code table at least 2^16 entries (128 kB)
 * long.
16 *
17 * Instead, we transform the character class table into a list of intervals,
18 * that will be accessed via a linear search.  Changes made to the table by the
19 * user will be appended.  A special class code IDENT (default) marks
 * characters whose class code is their own code number.
21 *
22 * We could alternatively use a sorted table of non-overlapping intervals that
23 * can be accessed via binary search, but merging in new intervals is
24 * significantly more hassle and not worth the effort here.
25 */
26
27#include <xterm.h>
28#include <charclass.h>
29
30#if OPT_WIDE_CHARS
31
/*
 * One interval of character codes together with the class code assigned to
 * it.  Both interval bounds are inclusive.
 */
struct classentry {
    int cclass;			/* class code for every character in the interval */
    int first;			/* lowest character code in the interval */
    int last;			/* highest character code in the interval */
};

/*
 * The interval table.  Entry 0 is reserved for bookkeeping rather than
 * describing an interval; real intervals start at index 1.
 */
static struct classentry *classtab;
37
38/*
39 * Special convention for classtab[0]:
40 * - classtab[0].cclass is the allocated number of entries in classtab
41 * - classtab[0].first = 1 (first used entry in classtab)
42 * - classtab[0].last is the last used entry in classtab
43 */
44
45int
46SetCharacterClassRange(int low, int high, int value)
47{
48    TRACE(("...SetCharacterClassRange (%#x .. %#x) = %d\n", low, high, value));
49
50    if (high < low)
51	return -1;		/* nothing to do */
52
53    /* make sure we have at least one free entry left at table end */
54    if (classtab[0].last > classtab[0].cclass - 2) {
55	classtab[0].cclass += 5 + classtab[0].cclass / 4;
56	classtab = TypeRealloc(struct classentry,
57			         (unsigned) classtab[0].cclass, classtab);
58	if (!classtab)
59	    abort();
60    }
61
62    /* simply append new interval to end of interval array */
63    classtab[0].last++;
64    classtab[classtab[0].last].first = low;
65    classtab[classtab[0].last].last = high;
66    classtab[classtab[0].last].cclass = value;
67
68    return 0;
69}
70
/*
 * Class codes used internally, listed in ascending numeric order.  The
 * class report describes each of these as "the first character code in a
 * class".
 */
typedef enum {
    IDENT = -1,			/* character is a class of its own */
    CNTRL = 1,			/* control characters */
    BLANK = 32,			/* blanks */
    ALNUM = 48,			/* letters, digits and similar */
    U_SUP = 0x2070,		/* superscript */
    U_SUB = 0x2080,		/* subscript */
    U_HIR = 0x3040,		/* Hiragana */
    U_KAT = 0x30a0,		/* Katakana */
    U_CJK = 0x4e00,		/* CJK Ideographs */
    U_HAN = 0xac00		/* Hangul Syllables */
} Classes;
83
84void
85init_classtab(void)
86{
87    const int size = 50;
88
89    TRACE(("init_classtab {{\n"));
90
91    classtab = TypeMallocN(struct classentry, (unsigned) size);
92    if (!classtab)
93	abort();
94    classtab[0].cclass = size;
95    classtab[0].first = 1;
96    classtab[0].last = 0;
97
98    /* old xterm default classes */
99    SetCharacterClassRange(0, 0, BLANK);
100    SetCharacterClassRange(1, 31, CNTRL);
101    SetCharacterClassRange('\t', '\t', BLANK);
102    SetCharacterClassRange('0', '9', ALNUM);
103    SetCharacterClassRange('A', 'Z', ALNUM);
104    SetCharacterClassRange('_', '_', ALNUM);
105    SetCharacterClassRange('a', 'z', ALNUM);
106    SetCharacterClassRange(127, 159, CNTRL);
107    SetCharacterClassRange(160, 191, IDENT);
108    SetCharacterClassRange(192, 255, ALNUM);
109    SetCharacterClassRange(215, 215, IDENT);
110    SetCharacterClassRange(247, 247, IDENT);
111
112    /* added Unicode classes */
113    SetCharacterClassRange(0x0100, 0xffdf, ALNUM);	/* mostly characters */
114    SetCharacterClassRange(0x037e, 0x037e, IDENT);	/* Greek question mark */
115    SetCharacterClassRange(0x0387, 0x0387, IDENT);	/* Greek ano teleia */
116    SetCharacterClassRange(0x055a, 0x055f, IDENT);	/* Armenian punctuation */
117    SetCharacterClassRange(0x0589, 0x0589, IDENT);	/* Armenian full stop */
118    SetCharacterClassRange(0x0700, 0x070d, IDENT);	/* Syriac punctuation */
119    SetCharacterClassRange(0x104a, 0x104f, IDENT);	/* Myanmar punctuation */
120    SetCharacterClassRange(0x10fb, 0x10fb, IDENT);	/* Georgian punctuation */
121    SetCharacterClassRange(0x1361, 0x1368, IDENT);	/* Ethiopic punctuation */
122    SetCharacterClassRange(0x166d, 0x166e, IDENT);	/* Canadian Syl. punctuation */
123    SetCharacterClassRange(0x17d4, 0x17dc, IDENT);	/* Khmer punctuation */
124    SetCharacterClassRange(0x1800, 0x180a, IDENT);	/* Mongolian punctuation */
125    SetCharacterClassRange(0x2000, 0x200a, BLANK);	/* spaces */
126    SetCharacterClassRange(0x200b, 0x27ff, IDENT);	/* punctuation and symbols */
127    SetCharacterClassRange(0x2070, 0x207f, U_SUP);	/* superscript */
128    SetCharacterClassRange(0x2080, 0x208f, U_SUB);	/* subscript */
129    SetCharacterClassRange(0x3000, 0x3000, BLANK);	/* ideographic space */
130    SetCharacterClassRange(0x3001, 0x3020, IDENT);	/* ideographic punctuation */
131    SetCharacterClassRange(0x3040, 0x309f, U_HIR);	/* Hiragana */
132    SetCharacterClassRange(0x30a0, 0x30ff, U_KAT);	/* Katakana */
133    SetCharacterClassRange(0x3300, 0x9fff, U_CJK);	/* CJK Ideographs */
134    SetCharacterClassRange(0xac00, 0xd7a3, U_HAN);	/* Hangul Syllables */
135    SetCharacterClassRange(0xf900, 0xfaff, U_CJK);	/* CJK Ideographs */
136    SetCharacterClassRange(0xfe30, 0xfe6b, IDENT);	/* punctuation forms */
137    SetCharacterClassRange(0xff00, 0xff0f, IDENT);	/* half/fullwidth ASCII */
138    SetCharacterClassRange(0xff1a, 0xff20, IDENT);	/* half/fullwidth ASCII */
139    SetCharacterClassRange(0xff3b, 0xff40, IDENT);	/* half/fullwidth ASCII */
140    SetCharacterClassRange(0xff5b, 0xff64, IDENT);	/* half/fullwidth ASCII */
141
142    TRACE(("}} init_classtab\n"));
143    return;
144}
145
146int
147CharacterClass(int c)
148{
149    int i, cclass = IDENT;
150
151    for (i = classtab[0].first; i <= classtab[0].last; i++)
152	if (classtab[i].first <= c && classtab[i].last >= c)
153	    cclass = classtab[i].cclass;
154
155    if (cclass < 0)
156	cclass = c;
157
158    return cclass;
159}
160
161#if OPT_REPORT_CCLASS
162#define charFormat(code) ((code) > 255 ? "0x%04X" : "%d")
163static const char *
164class_name(Classes code)
165{
166    static char buffer[80];
167    const char *result = "?";
168    switch (code) {
169    case IDENT:
170	result = "IDENT";
171	break;
172    case ALNUM:
173	result = "ALNUM";
174	break;
175    case CNTRL:
176	result = "CNTRL";
177	break;
178    case BLANK:
179	result = "BLANK";
180	break;
181    case U_SUP:
182	result = "superscript";
183	break;
184    case U_SUB:
185	result = "subscript";
186	break;
187    case U_CJK:
188	result = "CJK Ideographs";
189	break;
190    case U_HIR:
191	result = "Hiragana";
192	break;
193    case U_KAT:
194	result = "Katakana";
195	break;
196    case U_HAN:
197	result = "Hangul Syllables";
198	break;
199    default:
200	sprintf(buffer, charFormat(code), code);
201	result = buffer;
202	break;
203    }
204    return result;
205}
206
207void
208report_wide_char_class(void)
209{
210    static const Classes known_classes[] =
211    {IDENT, ALNUM, CNTRL, BLANK, U_SUP, U_SUB, U_HIR, U_KAT, U_CJK, U_HAN};
212    int i;
213
214    printf("\n");
215    printf("Unicode charClass data uses the last match\n");
216    printf("from these overlapping intervals of character codes:\n");
217    for (i = classtab[0].first; i <= classtab[0].last; i++) {
218	printf("\tU+%04X .. U+%04X %s\n",
219	       classtab[i].first,
220	       classtab[i].last,
221	       class_name(classtab[i].cclass));
222    }
223    printf("\n");
224    printf("These class-names are used internally (the first character code in a class):\n");
225    for (i = 0; i < (int) XtNumber(known_classes); ++i) {
226	printf("\t");
227	printf(charFormat(known_classes[i]), known_classes[i]);
228	printf(" = %s\n", class_name(known_classes[i]));
229    }
230}
231#endif /* OPT_REPORT_CCLASS */
232
233#ifdef NO_LEAKS
234void
235noleaks_CharacterClass(void)
236{
237    if (classtab != 0) {
238	free(classtab);
239	classtab = 0;
240    }
241}
242#endif
243
244#endif /* OPT_WIDE_CHARS */
245