/* charclass.c revision 913cc679 */
1/* $XTermId: charclass.c,v 1.28 2017/05/29 17:43:54 tom Exp $ */ 2 3/* 4 * Compact and efficient reimplementation of the 5 * xterm character class mechanism for large character sets 6 * 7 * Markus Kuhn -- mkuhn@acm.org -- 2000-07-03 8 * 9 * xterm allows users to select entire words with a double-click on the left 10 * mouse button. Opinions might differ on what type of characters are part of 11 * separate words, therefore xterm allows users to configure a class code for 12 * each 8-bit character. Words are maximum length sequences of neighboring 13 * characters with identical class code. Extending this mechanism to Unicode 14 * naively would create an at least 2^16 entries (128 kB) long class code 15 * table. 16 * 17 * Instead, we transform the character class table into a list of intervals, 18 * that will be accessed via a linear search. Changes made to the table by the 19 * user will be appended. A special class code IDENT (default) marks 20 * characters who have their code number as the class code. 21 * 22 * We could alternatively use a sorted table of non-overlapping intervals that 23 * can be accessed via binary search, but merging in new intervals is 24 * significantly more hassle and not worth the effort here. 25 */ 26 27#include <xterm.h> 28#include <charclass.h> 29 30#if OPT_WIDE_CHARS 31 32static struct classentry { 33 int cclass; 34 int first; 35 int last; 36} *classtab; 37 38/* 39 * Special convention for classtab[0]: 40 * - classtab[0].cclass is the allocated number of entries in classtab 41 * - classtab[0].first = 1 (first used entry in classtab) 42 * - classtab[0].last is the last used entry in classtab 43 */ 44 45int 46SetCharacterClassRange(int low, int high, int value) 47{ 48 TRACE(("...SetCharacterClassRange (%#x .. 
%#x) = %d\n", low, high, value)); 49 50 if (high < low) 51 return -1; /* nothing to do */ 52 53 /* make sure we have at least one free entry left at table end */ 54 if (classtab[0].last > classtab[0].cclass - 2) { 55 classtab[0].cclass += 5 + classtab[0].cclass / 4; 56 classtab = TypeRealloc(struct classentry, 57 (unsigned) classtab[0].cclass, classtab); 58 if (!classtab) 59 abort(); 60 } 61 62 /* simply append new interval to end of interval array */ 63 classtab[0].last++; 64 classtab[classtab[0].last].first = low; 65 classtab[classtab[0].last].last = high; 66 classtab[classtab[0].last].cclass = value; 67 68 return 0; 69} 70 71typedef enum { 72 IDENT = -1, 73 ALNUM = 48, 74 CNTRL = 1, 75 BLANK = 32, 76 U_CJK = 0x4e00, 77 U_SUP = 0x2070, 78 U_SUB = 0x2080, 79 U_HIR = 0x3040, 80 U_KAT = 0x30a0, 81 U_HAN = 0xac00 82} Classes; 83 84void 85init_classtab(void) 86{ 87 const int size = 50; 88 89 TRACE(("init_classtab {{\n")); 90 91 classtab = TypeMallocN(struct classentry, (unsigned) size); 92 if (!classtab) 93 abort(); 94 classtab[0].cclass = size; 95 classtab[0].first = 1; 96 classtab[0].last = 0; 97 98 /* old xterm default classes */ 99 SetCharacterClassRange(0, 0, BLANK); 100 SetCharacterClassRange(1, 31, CNTRL); 101 SetCharacterClassRange('\t', '\t', BLANK); 102 SetCharacterClassRange('0', '9', ALNUM); 103 SetCharacterClassRange('A', 'Z', ALNUM); 104 SetCharacterClassRange('_', '_', ALNUM); 105 SetCharacterClassRange('a', 'z', ALNUM); 106 SetCharacterClassRange(127, 159, CNTRL); 107 SetCharacterClassRange(160, 191, IDENT); 108 SetCharacterClassRange(192, 255, ALNUM); 109 SetCharacterClassRange(215, 215, IDENT); 110 SetCharacterClassRange(247, 247, IDENT); 111 112 /* added Unicode classes */ 113 SetCharacterClassRange(0x0100, 0xffdf, ALNUM); /* mostly characters */ 114 SetCharacterClassRange(0x037e, 0x037e, IDENT); /* Greek question mark */ 115 SetCharacterClassRange(0x0387, 0x0387, IDENT); /* Greek ano teleia */ 116 SetCharacterClassRange(0x055a, 0x055f, IDENT); /* 
Armenian punctuation */ 117 SetCharacterClassRange(0x0589, 0x0589, IDENT); /* Armenian full stop */ 118 SetCharacterClassRange(0x0700, 0x070d, IDENT); /* Syriac punctuation */ 119 SetCharacterClassRange(0x104a, 0x104f, IDENT); /* Myanmar punctuation */ 120 SetCharacterClassRange(0x10fb, 0x10fb, IDENT); /* Georgian punctuation */ 121 SetCharacterClassRange(0x1361, 0x1368, IDENT); /* Ethiopic punctuation */ 122 SetCharacterClassRange(0x166d, 0x166e, IDENT); /* Canadian Syl. punctuation */ 123 SetCharacterClassRange(0x17d4, 0x17dc, IDENT); /* Khmer punctuation */ 124 SetCharacterClassRange(0x1800, 0x180a, IDENT); /* Mongolian punctuation */ 125 SetCharacterClassRange(0x2000, 0x200a, BLANK); /* spaces */ 126 SetCharacterClassRange(0x200b, 0x27ff, IDENT); /* punctuation and symbols */ 127 SetCharacterClassRange(0x2070, 0x207f, U_SUP); /* superscript */ 128 SetCharacterClassRange(0x2080, 0x208f, U_SUB); /* subscript */ 129 SetCharacterClassRange(0x3000, 0x3000, BLANK); /* ideographic space */ 130 SetCharacterClassRange(0x3001, 0x3020, IDENT); /* ideographic punctuation */ 131 SetCharacterClassRange(0x3040, 0x309f, U_HIR); /* Hiragana */ 132 SetCharacterClassRange(0x30a0, 0x30ff, U_KAT); /* Katakana */ 133 SetCharacterClassRange(0x3300, 0x9fff, U_CJK); /* CJK Ideographs */ 134 SetCharacterClassRange(0xac00, 0xd7a3, U_HAN); /* Hangul Syllables */ 135 SetCharacterClassRange(0xf900, 0xfaff, U_CJK); /* CJK Ideographs */ 136 SetCharacterClassRange(0xfe30, 0xfe6b, IDENT); /* punctuation forms */ 137 SetCharacterClassRange(0xff00, 0xff0f, IDENT); /* half/fullwidth ASCII */ 138 SetCharacterClassRange(0xff1a, 0xff20, IDENT); /* half/fullwidth ASCII */ 139 SetCharacterClassRange(0xff3b, 0xff40, IDENT); /* half/fullwidth ASCII */ 140 SetCharacterClassRange(0xff5b, 0xff64, IDENT); /* half/fullwidth ASCII */ 141 142 TRACE(("}} init_classtab\n")); 143 return; 144} 145 146int 147CharacterClass(int c) 148{ 149 int i, cclass = IDENT; 150 151 for (i = classtab[0].first; i <= 
classtab[0].last; i++) 152 if (classtab[i].first <= c && classtab[i].last >= c) 153 cclass = classtab[i].cclass; 154 155 if (cclass < 0) 156 cclass = c; 157 158 return cclass; 159} 160 161#if OPT_REPORT_CCLASS 162#define charFormat(code) ((code) > 255 ? "0x%04X" : "%d") 163static const char * 164class_name(Classes code) 165{ 166 static char buffer[80]; 167 const char *result = "?"; 168 switch (code) { 169 case IDENT: 170 result = "IDENT"; 171 break; 172 case ALNUM: 173 result = "ALNUM"; 174 break; 175 case CNTRL: 176 result = "CNTRL"; 177 break; 178 case BLANK: 179 result = "BLANK"; 180 break; 181 case U_SUP: 182 result = "superscript"; 183 break; 184 case U_SUB: 185 result = "subscript"; 186 break; 187 case U_CJK: 188 result = "CJK Ideographs"; 189 break; 190 case U_HIR: 191 result = "Hiragana"; 192 break; 193 case U_KAT: 194 result = "Katakana"; 195 break; 196 case U_HAN: 197 result = "Hangul Syllables"; 198 break; 199 default: 200 sprintf(buffer, charFormat(code), code); 201 result = buffer; 202 break; 203 } 204 return result; 205} 206 207void 208report_wide_char_class(void) 209{ 210 static const Classes known_classes[] = 211 {IDENT, ALNUM, CNTRL, BLANK, U_SUP, U_SUB, U_HIR, U_KAT, U_CJK, U_HAN}; 212 int i; 213 214 printf("\n"); 215 printf("Unicode charClass data uses the last match\n"); 216 printf("from these overlapping intervals of character codes:\n"); 217 for (i = classtab[0].first; i <= classtab[0].last; i++) { 218 printf("\tU+%04X .. 
U+%04X %s\n", 219 classtab[i].first, 220 classtab[i].last, 221 class_name(classtab[i].cclass)); 222 } 223 printf("\n"); 224 printf("These class-names are used internally (the first character code in a class):\n"); 225 for (i = 0; i < (int) XtNumber(known_classes); ++i) { 226 printf("\t"); 227 printf(charFormat(known_classes[i]), known_classes[i]); 228 printf(" = %s\n", class_name(known_classes[i])); 229 } 230} 231#endif /* OPT_REPORT_CCLASS */ 232 233#ifdef NO_LEAKS 234void 235noleaks_CharacterClass(void) 236{ 237 if (classtab != 0) { 238 free(classtab); 239 classtab = 0; 240 } 241} 242#endif 243 244#endif /* OPT_WIDE_CHARS */ 245