charclass.c revision 913cc679
1913cc679Smrg/* $XTermId: charclass.c,v 1.28 2017/05/29 17:43:54 tom Exp $ */ 2d522f475Smrg 3d522f475Smrg/* 4d522f475Smrg * Compact and efficient reimplementation of the 5d522f475Smrg * xterm character class mechanism for large character sets 6d522f475Smrg * 7d522f475Smrg * Markus Kuhn -- mkuhn@acm.org -- 2000-07-03 8d522f475Smrg * 9894e0ac8Smrg * xterm allows users to select entire words with a double-click on the left 10d522f475Smrg * mouse button. Opinions might differ on what type of characters are part of 11d522f475Smrg * separate words, therefore xterm allows users to configure a class code for 12d522f475Smrg * each 8-bit character. Words are maximum length sequences of neighboring 13d522f475Smrg * characters with identical class code. Extending this mechanism to Unicode 14d522f475Smrg * naively would create an at least 2^16 entries (128 kB) long class code 15d522f475Smrg * table. 16d522f475Smrg * 17d522f475Smrg * Instead, we transform the character class table into a list of intervals, 18d522f475Smrg * that will be accessed via a linear search. Changes made to the table by the 19d522f475Smrg * user will be appended. A special class code IDENT (default) marks 20d522f475Smrg * characters who have their code number as the class code. 21d522f475Smrg * 22d522f475Smrg * We could alternatively use a sorted table of non-overlapping intervals that 23d522f475Smrg * can be accessed via binary search, but merging in new intervals is 24d522f475Smrg * significantly more hassle and not worth the effort here. 25d522f475Smrg */ 26d522f475Smrg 27d522f475Smrg#include <xterm.h> 28d522f475Smrg#include <charclass.h> 29d522f475Smrg 30d522f475Smrg#if OPT_WIDE_CHARS 31d522f475Smrg 32d522f475Smrgstatic struct classentry { 33d522f475Smrg int cclass; 34d522f475Smrg int first; 35d522f475Smrg int last; 36d522f475Smrg} *classtab; 37d522f475Smrg 38d522f475Smrg/* 39d522f475Smrg * Special convention for classtab[0]: 40d522f475Smrg * - classtab[0].cclass is the allocated number of entries in classtab 41d522f475Smrg * - classtab[0].first = 1 (first used entry in classtab) 42d522f475Smrg * - classtab[0].last is the last used entry in classtab 43d522f475Smrg */ 44d522f475Smrg 45d522f475Smrgint 46d522f475SmrgSetCharacterClassRange(int low, int high, int value) 47d522f475Smrg{ 48913cc679Smrg TRACE(("...SetCharacterClassRange (%#x .. %#x) = %d\n", low, high, value)); 49913cc679Smrg 50d522f475Smrg if (high < low) 51d522f475Smrg return -1; /* nothing to do */ 52d522f475Smrg 53d522f475Smrg /* make sure we have at least one free entry left at table end */ 54d522f475Smrg if (classtab[0].last > classtab[0].cclass - 2) { 55d522f475Smrg classtab[0].cclass += 5 + classtab[0].cclass / 4; 5620d2c4d2Smrg classtab = TypeRealloc(struct classentry, 5720d2c4d2Smrg (unsigned) classtab[0].cclass, classtab); 58d522f475Smrg if (!classtab) 59d522f475Smrg abort(); 60d522f475Smrg } 61d522f475Smrg 62d522f475Smrg /* simply append new interval to end of interval array */ 63d522f475Smrg classtab[0].last++; 64d522f475Smrg classtab[classtab[0].last].first = low; 65d522f475Smrg classtab[classtab[0].last].last = high; 66d522f475Smrg classtab[classtab[0].last].cclass = value; 67d522f475Smrg 68d522f475Smrg return 0; 69d522f475Smrg} 70d522f475Smrg 71d522f475Smrgtypedef enum { 72d522f475Smrg IDENT = -1, 73d522f475Smrg ALNUM = 48, 74d522f475Smrg CNTRL = 1, 75913cc679Smrg BLANK = 32, 76913cc679Smrg U_CJK = 0x4e00, 77913cc679Smrg U_SUP = 0x2070, 78913cc679Smrg U_SUB = 0x2080, 79913cc679Smrg U_HIR = 0x3040, 80913cc679Smrg U_KAT = 0x30a0, 81913cc679Smrg U_HAN = 0xac00 82d522f475Smrg} Classes; 83d522f475Smrg 84d522f475Smrgvoid 85d522f475Smrginit_classtab(void) 86d522f475Smrg{ 87d522f475Smrg const int size = 50; 88d522f475Smrg 89913cc679Smrg TRACE(("init_classtab {{\n")); 90913cc679Smrg 9120d2c4d2Smrg classtab = TypeMallocN(struct classentry, (unsigned) size); 92d522f475Smrg if (!classtab) 93d522f475Smrg abort(); 94d522f475Smrg classtab[0].cclass = size; 95d522f475Smrg classtab[0].first = 1; 96d522f475Smrg classtab[0].last = 0; 97d522f475Smrg 98d522f475Smrg /* old xterm default classes */ 99d522f475Smrg SetCharacterClassRange(0, 0, BLANK); 100d522f475Smrg SetCharacterClassRange(1, 31, CNTRL); 101d522f475Smrg SetCharacterClassRange('\t', '\t', BLANK); 102d522f475Smrg SetCharacterClassRange('0', '9', ALNUM); 103d522f475Smrg SetCharacterClassRange('A', 'Z', ALNUM); 104d522f475Smrg SetCharacterClassRange('_', '_', ALNUM); 105d522f475Smrg SetCharacterClassRange('a', 'z', ALNUM); 106d522f475Smrg SetCharacterClassRange(127, 159, CNTRL); 107d522f475Smrg SetCharacterClassRange(160, 191, IDENT); 108d522f475Smrg SetCharacterClassRange(192, 255, ALNUM); 109d522f475Smrg SetCharacterClassRange(215, 215, IDENT); 110d522f475Smrg SetCharacterClassRange(247, 247, IDENT); 111d522f475Smrg 112d522f475Smrg /* added Unicode classes */ 113d522f475Smrg SetCharacterClassRange(0x0100, 0xffdf, ALNUM); /* mostly characters */ 114d522f475Smrg SetCharacterClassRange(0x037e, 0x037e, IDENT); /* Greek question mark */ 115d522f475Smrg SetCharacterClassRange(0x0387, 0x0387, IDENT); /* Greek ano teleia */ 116d522f475Smrg SetCharacterClassRange(0x055a, 0x055f, IDENT); /* Armenian punctuation */ 117d522f475Smrg SetCharacterClassRange(0x0589, 0x0589, IDENT); /* Armenian full stop */ 118d522f475Smrg SetCharacterClassRange(0x0700, 0x070d, IDENT); /* Syriac punctuation */ 119d522f475Smrg SetCharacterClassRange(0x104a, 0x104f, IDENT); /* Myanmar punctuation */ 120d522f475Smrg SetCharacterClassRange(0x10fb, 0x10fb, IDENT); /* Georgian punctuation */ 121d522f475Smrg SetCharacterClassRange(0x1361, 0x1368, IDENT); /* Ethiopic punctuation */ 122d522f475Smrg SetCharacterClassRange(0x166d, 0x166e, IDENT); /* Canadian Syl. punctuation */ 123d522f475Smrg SetCharacterClassRange(0x17d4, 0x17dc, IDENT); /* Khmer punctuation */ 124d522f475Smrg SetCharacterClassRange(0x1800, 0x180a, IDENT); /* Mongolian punctuation */ 125d522f475Smrg SetCharacterClassRange(0x2000, 0x200a, BLANK); /* spaces */ 126d522f475Smrg SetCharacterClassRange(0x200b, 0x27ff, IDENT); /* punctuation and symbols */ 127913cc679Smrg SetCharacterClassRange(0x2070, 0x207f, U_SUP); /* superscript */ 128913cc679Smrg SetCharacterClassRange(0x2080, 0x208f, U_SUB); /* subscript */ 129d522f475Smrg SetCharacterClassRange(0x3000, 0x3000, BLANK); /* ideographic space */ 130d522f475Smrg SetCharacterClassRange(0x3001, 0x3020, IDENT); /* ideographic punctuation */ 131913cc679Smrg SetCharacterClassRange(0x3040, 0x309f, U_HIR); /* Hiragana */ 132913cc679Smrg SetCharacterClassRange(0x30a0, 0x30ff, U_KAT); /* Katakana */ 133913cc679Smrg SetCharacterClassRange(0x3300, 0x9fff, U_CJK); /* CJK Ideographs */ 134913cc679Smrg SetCharacterClassRange(0xac00, 0xd7a3, U_HAN); /* Hangul Syllables */ 135913cc679Smrg SetCharacterClassRange(0xf900, 0xfaff, U_CJK); /* CJK Ideographs */ 136d522f475Smrg SetCharacterClassRange(0xfe30, 0xfe6b, IDENT); /* punctuation forms */ 137d522f475Smrg SetCharacterClassRange(0xff00, 0xff0f, IDENT); /* half/fullwidth ASCII */ 138d522f475Smrg SetCharacterClassRange(0xff1a, 0xff20, IDENT); /* half/fullwidth ASCII */ 139d522f475Smrg SetCharacterClassRange(0xff3b, 0xff40, IDENT); /* half/fullwidth ASCII */ 140d522f475Smrg SetCharacterClassRange(0xff5b, 0xff64, IDENT); /* half/fullwidth ASCII */ 141d522f475Smrg 142913cc679Smrg TRACE(("}} init_classtab\n")); 143d522f475Smrg return; 144d522f475Smrg} 145d522f475Smrg 146d522f475Smrgint 147d522f475SmrgCharacterClass(int c) 148d522f475Smrg{ 149d522f475Smrg int i, cclass = IDENT; 150d522f475Smrg 151d522f475Smrg for (i = classtab[0].first; i <= classtab[0].last; i++) 152d522f475Smrg if (classtab[i].first <= c && classtab[i].last >= c) 153d522f475Smrg cclass = classtab[i].cclass; 154d522f475Smrg 155d522f475Smrg if (cclass < 0) 156d522f475Smrg cclass = c; 157d522f475Smrg 158d522f475Smrg return cclass; 159d522f475Smrg} 160d522f475Smrg 161913cc679Smrg#if OPT_REPORT_CCLASS 162913cc679Smrg#define charFormat(code) ((code) > 255 ? "0x%04X" : "%d") 163913cc679Smrgstatic const char * 164913cc679Smrgclass_name(Classes code) 165913cc679Smrg{ 166913cc679Smrg static char buffer[80]; 167913cc679Smrg const char *result = "?"; 168913cc679Smrg switch (code) { 169913cc679Smrg case IDENT: 170913cc679Smrg result = "IDENT"; 171913cc679Smrg break; 172913cc679Smrg case ALNUM: 173913cc679Smrg result = "ALNUM"; 174913cc679Smrg break; 175913cc679Smrg case CNTRL: 176913cc679Smrg result = "CNTRL"; 177913cc679Smrg break; 178913cc679Smrg case BLANK: 179913cc679Smrg result = "BLANK"; 180913cc679Smrg break; 181913cc679Smrg case U_SUP: 182913cc679Smrg result = "superscript"; 183913cc679Smrg break; 184913cc679Smrg case U_SUB: 185913cc679Smrg result = "subscript"; 186913cc679Smrg break; 187913cc679Smrg case U_CJK: 188913cc679Smrg result = "CJK Ideographs"; 189913cc679Smrg break; 190913cc679Smrg case U_HIR: 191913cc679Smrg result = "Hiragana"; 192913cc679Smrg break; 193913cc679Smrg case U_KAT: 194913cc679Smrg result = "Katakana"; 195913cc679Smrg break; 196913cc679Smrg case U_HAN: 197913cc679Smrg result = "Hangul Syllables"; 198913cc679Smrg break; 199913cc679Smrg default: 200913cc679Smrg sprintf(buffer, charFormat(code), code); 201913cc679Smrg result = buffer; 202913cc679Smrg break; 203913cc679Smrg } 204913cc679Smrg return result; 205913cc679Smrg} 206913cc679Smrg 207913cc679Smrgvoid 208913cc679Smrgreport_wide_char_class(void) 209913cc679Smrg{ 210913cc679Smrg static const Classes known_classes[] = 211913cc679Smrg {IDENT, ALNUM, CNTRL, BLANK, U_SUP, U_SUB, U_HIR, U_KAT, U_CJK, U_HAN}; 212913cc679Smrg int i; 213913cc679Smrg 214913cc679Smrg printf("\n"); 215913cc679Smrg printf("Unicode charClass data uses the last match\n"); 216913cc679Smrg printf("from these overlapping intervals of character codes:\n"); 217913cc679Smrg for (i = classtab[0].first; i <= classtab[0].last; i++) { 218913cc679Smrg printf("\tU+%04X .. U+%04X %s\n", 219913cc679Smrg classtab[i].first, 220913cc679Smrg classtab[i].last, 221913cc679Smrg class_name(classtab[i].cclass)); 222913cc679Smrg } 223913cc679Smrg printf("\n"); 224913cc679Smrg printf("These class-names are used internally (the first character code in a class):\n"); 225913cc679Smrg for (i = 0; i < (int) XtNumber(known_classes); ++i) { 226913cc679Smrg printf("\t"); 227913cc679Smrg printf(charFormat(known_classes[i]), known_classes[i]); 228913cc679Smrg printf(" = %s\n", class_name(known_classes[i])); 229913cc679Smrg } 230913cc679Smrg} 231913cc679Smrg#endif /* OPT_REPORT_CCLASS */ 232913cc679Smrg 233d522f475Smrg#ifdef NO_LEAKS 234d522f475Smrgvoid 235d522f475Smrgnoleaks_CharacterClass(void) 236d522f475Smrg{ 237d522f475Smrg if (classtab != 0) { 238d522f475Smrg free(classtab); 239d522f475Smrg classtab = 0; 240d522f475Smrg } 241d522f475Smrg} 242d522f475Smrg#endif 243d522f475Smrg 244d522f475Smrg#endif /* OPT_WIDE_CHARS */ 245