charclass.c revision 913cc679
1913cc679Smrg/* $XTermId: charclass.c,v 1.28 2017/05/29 17:43:54 tom Exp $ */
2d522f475Smrg
/*
 * Compact and efficient reimplementation of the
 * xterm character class mechanism for large character sets
 *
 * Markus Kuhn -- mkuhn@acm.org -- 2000-07-03
 *
 * xterm allows users to select entire words with a double-click on the left
 * mouse button.  Opinions might differ on what type of characters are part of
 * separate words, therefore xterm allows users to configure a class code for
 * each 8-bit character.  Words are maximum-length sequences of neighboring
 * characters with identical class code.  Naively extending this mechanism to
 * Unicode would require a class code table with at least 2^16 entries
 * (128 kB).
 *
 * Instead, we transform the character class table into a list of intervals
 * that is accessed via a linear search.  Changes made to the table by the
 * user are appended.  A special class code IDENT (default) marks
 * characters that have their code number as the class code.
 *
 * We could alternatively use a sorted table of non-overlapping intervals that
 * can be accessed via binary search, but merging in new intervals is
 * significantly more hassle and not worth the effort here.
 */
26d522f475Smrg
27d522f475Smrg#include <xterm.h>
28d522f475Smrg#include <charclass.h>
29d522f475Smrg
30d522f475Smrg#if OPT_WIDE_CHARS
31d522f475Smrg
/*
 * One interval of character codes together with its class code.
 *
 * Slot 0 of the table is not an interval; it is a header that reuses the
 * same three fields for bookkeeping (see the per-field notes below).
 */
struct classentry {
    int cclass;			/* class code; in slot 0: allocated number of entries */
    int first;			/* first code of interval; in slot 0: always 1, the first used entry */
    int last;			/* last code of interval; in slot 0: index of the last used entry */
};

/* The interval table itself; slot 0 is the header described above. */
static struct classentry *classtab;
44d522f475Smrg
45d522f475Smrgint
46d522f475SmrgSetCharacterClassRange(int low, int high, int value)
47d522f475Smrg{
48913cc679Smrg    TRACE(("...SetCharacterClassRange (%#x .. %#x) = %d\n", low, high, value));
49913cc679Smrg
50d522f475Smrg    if (high < low)
51d522f475Smrg	return -1;		/* nothing to do */
52d522f475Smrg
53d522f475Smrg    /* make sure we have at least one free entry left at table end */
54d522f475Smrg    if (classtab[0].last > classtab[0].cclass - 2) {
55d522f475Smrg	classtab[0].cclass += 5 + classtab[0].cclass / 4;
5620d2c4d2Smrg	classtab = TypeRealloc(struct classentry,
5720d2c4d2Smrg			         (unsigned) classtab[0].cclass, classtab);
58d522f475Smrg	if (!classtab)
59d522f475Smrg	    abort();
60d522f475Smrg    }
61d522f475Smrg
62d522f475Smrg    /* simply append new interval to end of interval array */
63d522f475Smrg    classtab[0].last++;
64d522f475Smrg    classtab[classtab[0].last].first = low;
65d522f475Smrg    classtab[classtab[0].last].last = high;
66d522f475Smrg    classtab[classtab[0].last].cclass = value;
67d522f475Smrg
68d522f475Smrg    return 0;
69d522f475Smrg}
70d522f475Smrg
/*
 * Class codes used by the built-in table.  Apart from IDENT, each value is
 * the code of a character belonging to the class it names.
 */
typedef enum {
    /* marker: the character's own code number is its class */
    IDENT = -1,

    /* classes inherited from the 8-bit table */
    CNTRL = 1,
    BLANK = 32,
    ALNUM = 48,

    /* classes named after the first code of a Unicode range */
    U_SUP = 0x2070,		/* superscript */
    U_SUB = 0x2080,		/* subscript */
    U_HIR = 0x3040,		/* Hiragana */
    U_KAT = 0x30a0,		/* Katakana */
    U_CJK = 0x4e00,		/* CJK Ideographs */
    U_HAN = 0xac00		/* Hangul Syllables */
} Classes;
83d522f475Smrg
84d522f475Smrgvoid
85d522f475Smrginit_classtab(void)
86d522f475Smrg{
87d522f475Smrg    const int size = 50;
88d522f475Smrg
89913cc679Smrg    TRACE(("init_classtab {{\n"));
90913cc679Smrg
9120d2c4d2Smrg    classtab = TypeMallocN(struct classentry, (unsigned) size);
92d522f475Smrg    if (!classtab)
93d522f475Smrg	abort();
94d522f475Smrg    classtab[0].cclass = size;
95d522f475Smrg    classtab[0].first = 1;
96d522f475Smrg    classtab[0].last = 0;
97d522f475Smrg
98d522f475Smrg    /* old xterm default classes */
99d522f475Smrg    SetCharacterClassRange(0, 0, BLANK);
100d522f475Smrg    SetCharacterClassRange(1, 31, CNTRL);
101d522f475Smrg    SetCharacterClassRange('\t', '\t', BLANK);
102d522f475Smrg    SetCharacterClassRange('0', '9', ALNUM);
103d522f475Smrg    SetCharacterClassRange('A', 'Z', ALNUM);
104d522f475Smrg    SetCharacterClassRange('_', '_', ALNUM);
105d522f475Smrg    SetCharacterClassRange('a', 'z', ALNUM);
106d522f475Smrg    SetCharacterClassRange(127, 159, CNTRL);
107d522f475Smrg    SetCharacterClassRange(160, 191, IDENT);
108d522f475Smrg    SetCharacterClassRange(192, 255, ALNUM);
109d522f475Smrg    SetCharacterClassRange(215, 215, IDENT);
110d522f475Smrg    SetCharacterClassRange(247, 247, IDENT);
111d522f475Smrg
112d522f475Smrg    /* added Unicode classes */
113d522f475Smrg    SetCharacterClassRange(0x0100, 0xffdf, ALNUM);	/* mostly characters */
114d522f475Smrg    SetCharacterClassRange(0x037e, 0x037e, IDENT);	/* Greek question mark */
115d522f475Smrg    SetCharacterClassRange(0x0387, 0x0387, IDENT);	/* Greek ano teleia */
116d522f475Smrg    SetCharacterClassRange(0x055a, 0x055f, IDENT);	/* Armenian punctuation */
117d522f475Smrg    SetCharacterClassRange(0x0589, 0x0589, IDENT);	/* Armenian full stop */
118d522f475Smrg    SetCharacterClassRange(0x0700, 0x070d, IDENT);	/* Syriac punctuation */
119d522f475Smrg    SetCharacterClassRange(0x104a, 0x104f, IDENT);	/* Myanmar punctuation */
120d522f475Smrg    SetCharacterClassRange(0x10fb, 0x10fb, IDENT);	/* Georgian punctuation */
121d522f475Smrg    SetCharacterClassRange(0x1361, 0x1368, IDENT);	/* Ethiopic punctuation */
122d522f475Smrg    SetCharacterClassRange(0x166d, 0x166e, IDENT);	/* Canadian Syl. punctuation */
123d522f475Smrg    SetCharacterClassRange(0x17d4, 0x17dc, IDENT);	/* Khmer punctuation */
124d522f475Smrg    SetCharacterClassRange(0x1800, 0x180a, IDENT);	/* Mongolian punctuation */
125d522f475Smrg    SetCharacterClassRange(0x2000, 0x200a, BLANK);	/* spaces */
126d522f475Smrg    SetCharacterClassRange(0x200b, 0x27ff, IDENT);	/* punctuation and symbols */
127913cc679Smrg    SetCharacterClassRange(0x2070, 0x207f, U_SUP);	/* superscript */
128913cc679Smrg    SetCharacterClassRange(0x2080, 0x208f, U_SUB);	/* subscript */
129d522f475Smrg    SetCharacterClassRange(0x3000, 0x3000, BLANK);	/* ideographic space */
130d522f475Smrg    SetCharacterClassRange(0x3001, 0x3020, IDENT);	/* ideographic punctuation */
131913cc679Smrg    SetCharacterClassRange(0x3040, 0x309f, U_HIR);	/* Hiragana */
132913cc679Smrg    SetCharacterClassRange(0x30a0, 0x30ff, U_KAT);	/* Katakana */
133913cc679Smrg    SetCharacterClassRange(0x3300, 0x9fff, U_CJK);	/* CJK Ideographs */
134913cc679Smrg    SetCharacterClassRange(0xac00, 0xd7a3, U_HAN);	/* Hangul Syllables */
135913cc679Smrg    SetCharacterClassRange(0xf900, 0xfaff, U_CJK);	/* CJK Ideographs */
136d522f475Smrg    SetCharacterClassRange(0xfe30, 0xfe6b, IDENT);	/* punctuation forms */
137d522f475Smrg    SetCharacterClassRange(0xff00, 0xff0f, IDENT);	/* half/fullwidth ASCII */
138d522f475Smrg    SetCharacterClassRange(0xff1a, 0xff20, IDENT);	/* half/fullwidth ASCII */
139d522f475Smrg    SetCharacterClassRange(0xff3b, 0xff40, IDENT);	/* half/fullwidth ASCII */
140d522f475Smrg    SetCharacterClassRange(0xff5b, 0xff64, IDENT);	/* half/fullwidth ASCII */
141d522f475Smrg
142913cc679Smrg    TRACE(("}} init_classtab\n"));
143d522f475Smrg    return;
144d522f475Smrg}
145d522f475Smrg
146d522f475Smrgint
147d522f475SmrgCharacterClass(int c)
148d522f475Smrg{
149d522f475Smrg    int i, cclass = IDENT;
150d522f475Smrg
151d522f475Smrg    for (i = classtab[0].first; i <= classtab[0].last; i++)
152d522f475Smrg	if (classtab[i].first <= c && classtab[i].last >= c)
153d522f475Smrg	    cclass = classtab[i].cclass;
154d522f475Smrg
155d522f475Smrg    if (cclass < 0)
156d522f475Smrg	cclass = c;
157d522f475Smrg
158d522f475Smrg    return cclass;
159d522f475Smrg}
160d522f475Smrg
161913cc679Smrg#if OPT_REPORT_CCLASS
162913cc679Smrg#define charFormat(code) ((code) > 255 ? "0x%04X" : "%d")
163913cc679Smrgstatic const char *
164913cc679Smrgclass_name(Classes code)
165913cc679Smrg{
166913cc679Smrg    static char buffer[80];
167913cc679Smrg    const char *result = "?";
168913cc679Smrg    switch (code) {
169913cc679Smrg    case IDENT:
170913cc679Smrg	result = "IDENT";
171913cc679Smrg	break;
172913cc679Smrg    case ALNUM:
173913cc679Smrg	result = "ALNUM";
174913cc679Smrg	break;
175913cc679Smrg    case CNTRL:
176913cc679Smrg	result = "CNTRL";
177913cc679Smrg	break;
178913cc679Smrg    case BLANK:
179913cc679Smrg	result = "BLANK";
180913cc679Smrg	break;
181913cc679Smrg    case U_SUP:
182913cc679Smrg	result = "superscript";
183913cc679Smrg	break;
184913cc679Smrg    case U_SUB:
185913cc679Smrg	result = "subscript";
186913cc679Smrg	break;
187913cc679Smrg    case U_CJK:
188913cc679Smrg	result = "CJK Ideographs";
189913cc679Smrg	break;
190913cc679Smrg    case U_HIR:
191913cc679Smrg	result = "Hiragana";
192913cc679Smrg	break;
193913cc679Smrg    case U_KAT:
194913cc679Smrg	result = "Katakana";
195913cc679Smrg	break;
196913cc679Smrg    case U_HAN:
197913cc679Smrg	result = "Hangul Syllables";
198913cc679Smrg	break;
199913cc679Smrg    default:
200913cc679Smrg	sprintf(buffer, charFormat(code), code);
201913cc679Smrg	result = buffer;
202913cc679Smrg	break;
203913cc679Smrg    }
204913cc679Smrg    return result;
205913cc679Smrg}
206913cc679Smrg
207913cc679Smrgvoid
208913cc679Smrgreport_wide_char_class(void)
209913cc679Smrg{
210913cc679Smrg    static const Classes known_classes[] =
211913cc679Smrg    {IDENT, ALNUM, CNTRL, BLANK, U_SUP, U_SUB, U_HIR, U_KAT, U_CJK, U_HAN};
212913cc679Smrg    int i;
213913cc679Smrg
214913cc679Smrg    printf("\n");
215913cc679Smrg    printf("Unicode charClass data uses the last match\n");
216913cc679Smrg    printf("from these overlapping intervals of character codes:\n");
217913cc679Smrg    for (i = classtab[0].first; i <= classtab[0].last; i++) {
218913cc679Smrg	printf("\tU+%04X .. U+%04X %s\n",
219913cc679Smrg	       classtab[i].first,
220913cc679Smrg	       classtab[i].last,
221913cc679Smrg	       class_name(classtab[i].cclass));
222913cc679Smrg    }
223913cc679Smrg    printf("\n");
224913cc679Smrg    printf("These class-names are used internally (the first character code in a class):\n");
225913cc679Smrg    for (i = 0; i < (int) XtNumber(known_classes); ++i) {
226913cc679Smrg	printf("\t");
227913cc679Smrg	printf(charFormat(known_classes[i]), known_classes[i]);
228913cc679Smrg	printf(" = %s\n", class_name(known_classes[i]));
229913cc679Smrg    }
230913cc679Smrg}
231913cc679Smrg#endif /* OPT_REPORT_CCLASS */
232913cc679Smrg
233d522f475Smrg#ifdef NO_LEAKS
234d522f475Smrgvoid
235d522f475Smrgnoleaks_CharacterClass(void)
236d522f475Smrg{
237d522f475Smrg    if (classtab != 0) {
238d522f475Smrg	free(classtab);
239d522f475Smrg	classtab = 0;
240d522f475Smrg    }
241d522f475Smrg}
242d522f475Smrg#endif
243d522f475Smrg
244d522f475Smrg#endif /* OPT_WIDE_CHARS */
245