charset.c revision a8fdb4bc
1/* 2Copyright (c) 2001 by Juliusz Chroboczek 3 4Permission is hereby granted, free of charge, to any person obtaining a copy 5of this software and associated documentation files (the "Software"), to deal 6in the Software without restriction, including without limitation the rights 7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8copies of the Software, and to permit persons to whom the Software is 9furnished to do so, subject to the following conditions: 10 11The above copyright notice and this permission notice shall be included in 12all copies or substantial portions of the Software. 13 14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20THE SOFTWARE. 21*/ 22/* $XFree86: xc/programs/luit/charset.c,v 1.8 2003/12/22 17:48:12 tsi Exp $ */ 23 24#include <stdlib.h> 25#include <stdio.h> 26#include <string.h> 27#include <ctype.h> 28#include <X11/fonts/fontenc.h> 29#include "other.h" 30#include "charset.h" 31#include "parser.h" 32 33#ifndef NULL 34#define NULL 0 35#endif 36 37static unsigned int 38IdentityRecode(unsigned int n, CharsetPtr self) 39{ 40 return n; 41} 42 43#ifdef UNUSED 44static int 45IdentityReverse(unsigned int n, CharsetPtr self) 46{ 47#define IS_GL(n) ((n) >= 0x20 && (n) < 0x80) 48 switch(self->type) { 49 case T_94: 50 case T_96: 51 if (IS_GL(n)) return n; else return -1; 52 case T_128: 53 if (n < 0x80) return n; else return -1; 54 case T_9494: 55 case T_9696: 56 if(IS_GL(n>>8) && IS_GL(n&0xFF)) 57 return n; 58 else 59 return -1; 60 case T_94192: 61 if(IS_GL(n>>8) && IS_GL(n&0x7F)) 62 return n; 63 else 64 return -1; 65 default: 66 abort(); 67 } 68#undef IS_GL 69} 70#endif 71 72static int 73NullReverse(unsigned int n, CharsetPtr self) 74{ 75 return -1; 76} 77 78CharsetRec Unknown94Charset = 79{ "Unknown (94)", T_94, 0, IdentityRecode, NullReverse, NULL, NULL}; 80CharsetRec Unknown96Charset = 81{ "Unknown (96)", T_96, 0, IdentityRecode, NullReverse, NULL, NULL}; 82CharsetRec Unknown9494Charset = 83{ "Unknown (94x94)", T_9494, 0, IdentityRecode, NullReverse, NULL, NULL}; 84CharsetRec Unknown9696Charset = 85{ "Unknown (96x96)", T_9696, 0, IdentityRecode, NullReverse, NULL, NULL}; 86 87typedef struct _FontencCharset { 88 char *name; 89 int type; 90 unsigned char final; 91 char *xlfd; 92 int shift; 93 FontMapPtr mapping; 94 FontMapReversePtr reverse; 95} FontencCharsetRec, *FontencCharsetPtr; 96 97FontencCharsetRec fontencCharsets[] = { 98 {"ISO 646 (1973)", T_94, '@', "iso646.1973-0", 0x00, NULL, NULL}, 99 {"ASCII", T_94, 'B', "iso8859-1", 0x00, NULL, NULL}, 100 {"JIS X 0201:GL", T_94, 'J', "jisx0201.1976-0", 0x00, NULL, NULL}, 101 {"JIS X 0201:GR", T_94, 'I', "jisx0201.1976-0", 0x80, NULL, NULL}, 102 {"DEC Special", T_94, '0', "dec-special", 0x00, NULL, NULL}, 103 {"DEC Technical", T_94, '>', "dec-dectech", 0x00, NULL, NULL}, 104 105 {"ISO 8859-1", T_96, 'A', "iso8859-1", 0x80, NULL, NULL}, 106 {"ISO 8859-2", T_96, 'B', "iso8859-2", 0x80, NULL, NULL}, 107 {"ISO 8859-3", T_96, 'C', "iso8859-3", 0x80, NULL, NULL}, 108 {"ISO 8859-4", T_96, 'D', "iso8859-4", 0x80, NULL, NULL}, 109 {"ISO 8859-5", T_96, 'L', "iso8859-5", 0x80, NULL, NULL}, 110 {"ISO 8859-6", T_96, 'G', "iso8859-6", 0x80, NULL, NULL}, 111 {"ISO 8859-7", T_96, 'F', "iso8859-7", 0x80, NULL, NULL}, 112 {"ISO 8859-8", T_96, 'H', "iso8859-8", 0x80, NULL, NULL}, 113 {"ISO 8859-9", T_96, 'M', "iso8859-9", 0x80, NULL, NULL}, 114 {"ISO 8859-10", T_96, 'V', "iso8859-10", 0x80, NULL, NULL}, 115 {"ISO 8859-11", T_96, 'T', "iso8859-11", 0x80, NULL, NULL}, 116 {"TIS 620", T_96, 'T', "iso8859-11", 0x80, NULL, NULL}, 117 {"ISO 8859-13", T_96, 'Y', "iso8859-13", 0x80, NULL, NULL}, 118 {"ISO 8859-14", T_96, '_', "iso8859-14", 0x80, NULL, NULL}, 119 {"ISO 8859-15", T_96, 'b', "iso8859-15", 0x80, NULL, NULL}, 120 {"ISO 8859-16", T_96, 'f', "iso8859-16", 0x80, NULL, NULL}, 121 {"KOI8-E", T_96, '@', "koi8-e", 0x80, NULL, NULL}, 122 {"TCVN", T_96, 'Z', "tcvn-0", 0x80, NULL, NULL}, 123 124 {"GB 2312", T_9494, 'A', "gb2312.1980-0", 0x0000, NULL, NULL}, 125 {"JIS X 0208", T_9494, 'B', "jisx0208.1990-0", 0x0000, NULL, NULL}, 126 {"KSC 5601", T_9494, 'C', "ksc5601.1987-0", 0x0000, NULL, NULL}, 127 {"JIS X 0212", T_9494, 'D', "jisx0212.1990-0", 0x0000, NULL, NULL}, 128 129 {"GB 2312", T_9696, 'A', "gb2312.1980-0", 0x0000, NULL, NULL}, 130 {"JIS X 0208", T_9696, 'B', "jisx0208.1990-0", 0x0000, NULL, NULL}, 131 {"KSC 5601", T_9696, 'C', "ksc5601.1987-0", 0x0000, NULL, NULL}, 132 {"JIS X 0212", T_9696, 'D', "jisx0212.1990-0", 0x0000, NULL, NULL}, 133 134 {"KOI8-R", T_128, 0, "koi8-r", 0x80, NULL, NULL}, 135 {"KOI8-U", T_128, 0, "koi8-u", 0x80, NULL, NULL}, 136 {"KOI8-RU", T_128, 0, "koi8-ru", 0x80, NULL, NULL}, 137 {"CP 1252", T_128, 0, "microsoft-cp1252", 0x80, NULL, NULL}, 138 {"CP 1251", T_128, 0, "microsoft-cp1251", 0x80, NULL, NULL}, 139 {"CP 1250", T_128, 0, "microsoft-cp1250", 0x80, NULL, NULL}, 140 141 {"CP 437", T_128, 0, "ibm-cp437", 0x80, NULL, NULL}, 142 {"CP 850", T_128, 0, "ibm-cp850", 0x80, NULL, NULL}, 143 {"CP 866", T_128, 0, "ibm-cp866", 0x80, NULL, NULL}, 144 145 {"Big 5", T_94192, 0, "big5.eten-0", 0x8000, NULL, NULL}, 146 {NULL, 0, 0, NULL, 0, NULL, NULL} 147}; 148 149typedef struct _OtherCharset { 150 char *name; 151 int (*init)(OtherStatePtr); 152 unsigned int (*mapping)(unsigned int, OtherStatePtr); 153 unsigned int (*reverse)(unsigned int, OtherStatePtr); 154 int (*stack)(unsigned char, OtherStatePtr); 155} OtherCharsetRec, *OtherCharsetPtr; 156 157OtherCharsetRec otherCharsets[] = { 158 {"GBK", init_gbk, mapping_gbk, reverse_gbk, stack_gbk}, 159 {"UTF-8", init_utf8, mapping_utf8, reverse_utf8, stack_utf8}, 160 {"SJIS", init_sjis, mapping_sjis, reverse_sjis, stack_sjis}, 161 {"BIG5-HKSCS", init_hkscs, mapping_hkscs, reverse_hkscs, stack_hkscs}, 162 {"GB18030", init_gb18030, mapping_gb18030, reverse_gb18030, stack_gb18030}, 163 {NULL, NULL, NULL, NULL, NULL} 164}; 165 166static int 167compare(const char *s, const char *t) 168{ 169 while(*s || *t) { 170 if(*s && (isspace(*s) || *s == '-' || *s == '_')) 171 s++; 172 else if(*t && (isspace(*t) || *t == '-' || *t == '_')) 173 t++; 174 else if(*s && *t && tolower(*s) == tolower(*t)) { 175 s++; 176 t++; 177 } else 178 return 1; 179 } 180 return 0; 181} 182 183static unsigned int 184FontencCharsetRecode(unsigned int n, CharsetPtr self) 185{ 186 FontencCharsetPtr fc = (FontencCharsetPtr)(self->data); 187 188 return FontEncRecode(n + fc->shift, fc->mapping); 189} 190 191static int 192FontencCharsetReverse(unsigned int i, CharsetPtr self) 193{ 194 FontencCharsetPtr fc = (FontencCharsetPtr)(self->data); 195 int n; 196 197 n = fc->reverse->reverse(i, fc->reverse->data); 198 if(n == 0 || n < fc->shift) 199 return -1; 200 else 201 n -= fc->shift; 202 203#define IS_GL(n) ((n) >= 0x20 && (n) < 0x80) 204 switch(self->type) { 205 case T_94: case T_96: 206 if (IS_GL(n)) return n; else return -1; 207 break; 208 case T_128: 209 if (n < 0x80) return n; else return -1; 210 case T_9494: case T_9696: 211 if(IS_GL(n>>8) && IS_GL(n&0xFF)) 212 return n; 213 else 214 return -1; 215 break; 216 case T_94192: 217 if(IS_GL(n>>8) && IS_GL(n&0x7F)) 218 return n; 219 else 220 return -1; 221 break; 222 default: 223 abort(); 224 } 225#undef IS_GL 226} 227 228 229static CharsetPtr cachedCharsets = NULL; 230 231static CharsetPtr 232getCachedCharset(unsigned char final, int type, const char *name) 233{ 234 CharsetPtr c; 235 for(c = cachedCharsets; c; c = c->next) { 236 if(((c->type == type && c->final == final) || 237 (name && !compare(c->name, name))) && 238 (c->type != T_FAILED)) 239 return c; 240 } 241 return NULL; 242} 243 244static void 245cacheCharset(CharsetPtr c) { 246 c->next = cachedCharsets; 247 cachedCharsets = c; 248} 249 250static CharsetPtr 251getFontencCharset(unsigned char final, int type, const char *name) 252{ 253 FontencCharsetPtr fc; 254 CharsetPtr c; 255 FontMapPtr mapping; 256 FontMapReversePtr reverse; 257 258 fc = fontencCharsets; 259 while(fc->name) { 260 if(((fc->type == type && fc->final == final) || 261 (name && !compare(fc->name, name))) && 262 (fc->type != T_FAILED)) 263 break; 264 fc++; 265 } 266 267 if(!fc->name) 268 return NULL; 269 270 c = malloc(sizeof(CharsetRec)); 271 if(c == NULL) 272 return NULL; 273 274 mapping = FontEncMapFind(fc->xlfd, FONT_ENCODING_UNICODE, -1, -1, NULL); 275 if(!mapping) { 276 fc->type = T_FAILED; 277 return NULL; 278 } 279 280 reverse = FontMapReverse(mapping); 281 if(!reverse) { 282 fc->type = T_FAILED; 283 return NULL; 284 } 285 286 fc->mapping = mapping; 287 fc->reverse = reverse; 288 289 c->name = fc->name; 290 c->type = fc->type; 291 c->final = fc->final; 292 c->recode = FontencCharsetRecode; 293 c->reverse = FontencCharsetReverse; 294 c->data = fc; 295 296 cacheCharset(c); 297 return c; 298} 299 300static CharsetPtr 301getOtherCharset(const char *name) 302{ 303 OtherCharsetPtr fc; 304 CharsetPtr c; 305 OtherStatePtr s; 306 307 fc = otherCharsets; 308 while(fc->name) { 309 if(name && !compare(fc->name, name)) 310 break; 311 fc++; 312 } 313 314 if(!fc->name) 315 return NULL; 316 317 c = malloc(sizeof(CharsetRec)); 318 if(c == NULL) 319 return NULL; 320 321 s = malloc(sizeof(OtherState)); 322 if(s == NULL) { 323 free(c); 324 return NULL; 325 } 326 327 c->name = fc->name; 328 c->type = T_OTHER; 329 c->final = 0; 330 c->data = fc; 331 c->other_recode = fc->mapping; 332 c->other_reverse = fc->reverse; 333 c->other_stack = fc->stack; 334 c->other_aux = s; 335 336 if(!fc->init(s)) { 337 c->type = T_FAILED; 338 return NULL; 339 } 340 341 cacheCharset(c); 342 return c; 343} 344 345CharsetPtr 346getUnknownCharset(int type) 347{ 348 switch(type) { 349 case T_94: return &Unknown94Charset; 350 case T_96: return &Unknown96Charset; 351 case T_9494: return &Unknown9494Charset; 352 case T_9696: return &Unknown9696Charset; 353 default: return &Unknown94Charset; 354 } 355} 356 357CharsetPtr 358getCharset(unsigned char final, int type) 359{ 360 CharsetPtr c; 361 362 c = getCachedCharset(final, type, NULL); 363 if(c) 364 return c; 365 366 c = getFontencCharset(final, type, NULL); 367 if(c) 368 return c; 369 370 return getUnknownCharset(type); 371} 372 373CharsetPtr 374getCharsetByName(const char *name) 375{ 376 CharsetPtr c; 377 378 if(name == NULL) 379 return getUnknownCharset(T_94); 380 381 c = getCachedCharset(0, 0, name); 382 if(c) 383 return c; 384 385 c = getFontencCharset(0, 0, name); 386 if(c) 387 return c; 388 389 c = getOtherCharset(name); 390 if(c) 391 return c; 392 393 return getUnknownCharset(T_94); 394} 395 396const LocaleCharsetRec localeCharsets[] = { 397 { "C", 0, 2, "ASCII", NULL, "ISO 8859-1", NULL, NULL}, 398 { "POSIX", 0, 2, "ASCII", NULL, "ISO 8859-1", NULL, NULL}, 399 { "ISO8859-1", 0, 2, "ASCII", NULL, "ISO 8859-1", NULL, NULL}, 400 { "ISO8859-2", 0, 2, "ASCII", NULL, "ISO 8859-2", NULL, NULL}, 401 { "ISO8859-3", 0, 2, "ASCII", NULL, "ISO 8859-3", NULL, NULL}, 402 { "ISO8859-4", 0, 2, "ASCII", NULL, "ISO 8859-4", NULL, NULL}, 403 { "ISO8859-5", 0, 2, "ASCII", NULL, "ISO 8859-5", NULL, NULL}, 404 { "ISO8859-6", 0, 2, "ASCII", NULL, "ISO 8859-6", NULL, NULL}, 405 { "ISO8859-7", 0, 2, "ASCII", NULL, "ISO 8859-7", NULL, NULL}, 406 { "ISO8859-8", 0, 2, "ASCII", NULL, "ISO 8859-8", NULL, NULL}, 407 { "ISO8859-9", 0, 2, "ASCII", NULL, "ISO 8859-9", NULL, NULL}, 408 { "ISO8859-10", 0, 2, "ASCII", NULL, "ISO 8859-10", NULL, NULL}, 409 { "ISO8859-11", 0, 2, "ASCII", NULL, "ISO 8859-11", NULL, NULL}, 410 { "TIS620", 0, 2, "ASCII", NULL, "ISO 8859-11", NULL, NULL}, 411 { "ISO8859-13", 0, 2, "ASCII", NULL, "ISO 8859-13", NULL, NULL}, 412 { "ISO8859-14", 0, 2, "ASCII", NULL, "ISO 8859-14", NULL, NULL}, 413 { "ISO8859-15", 0, 2, "ASCII", NULL, "ISO 8859-15", NULL, NULL}, 414 { "ISO8859-16", 0, 2, "ASCII", NULL, "ISO 8859-16", NULL, NULL}, 415 { "KOI8-R", 0, 2, "ASCII", NULL, "KOI8-R", NULL, NULL}, 416 { "CP1251", 0, 2, "ASCII", NULL, "CP 1251", NULL, NULL}, 417 { "TCVN", 0, 2, "ASCII", NULL, "TCVN", NULL, NULL}, 418 { "eucCN", 0, 1, "ASCII", "GB 2312", NULL, NULL, NULL}, 419 { "GB2312", 0, 1, "ASCII", "GB 2312", NULL, NULL, NULL}, 420 { "eucJP", 0, 1, "ASCII", "JIS X 0208", "JIS X 0201:GR", "JIS X 0212", NULL}, 421 { "eucKR", 0, 1, "ASCII", "KSC 5601", NULL, NULL, NULL}, 422 { "eucCN", 0, 1, "ASCII", "GB 2312", NULL, NULL, NULL}, 423 { "Big5", 0, 1, "ASCII", "Big 5", NULL, NULL, NULL}, 424 { "gbk", 0, 1, NULL, NULL, NULL, NULL, "GBK"}, 425 { "UTF-8", 0, 1, NULL, NULL, NULL, NULL, "UTF-8"}, 426 { "SJIS", 0, 1, NULL, NULL, NULL, NULL, "SJIS"}, 427 { "Big5-HKSCS", 0, 1, NULL, NULL, NULL, NULL, "BIG5-HKSCS"}, 428 { "gb18030", 0, 1, NULL, NULL, NULL, NULL, "GB18030"}, 429 { NULL, 0, 0, NULL, NULL, NULL, NULL, NULL} 430}; 431 432void 433reportCharsets(void) 434{ 435 const LocaleCharsetRec *p; 436 FontencCharsetPtr q; 437 printf("Known locale encodings:\n\n"); 438 for(p = localeCharsets; p->name; p++) { 439 if(p->other) { 440 printf(" %s (non-ISO-2022 encoding)\n", p->other); 441 continue; 442 } 443 printf(" %s: GL -> G%d, GR -> G%d", p->name, p->gl, p->gr); 444 if(p->g0) printf(", G0: %s", p->g0); 445 if(p->g1) printf(", G1: %s", p->g1); 446 if(p->g2) printf(", G2: %s", p->g2); 447 if(p->g3) printf(", G3: %s", p->g3); 448 printf("\n"); 449 } 450 451 printf("\n\nKnown charsets (not all may be available):\n\n"); 452 for(q = fontencCharsets; q->name; q++) 453 printf(" %s%s\n", 454 q->name, q->final?" (ISO 2022)":""); 455} 456 457int 458getLocaleState(const char *locale, char *charset, 459 int *gl_return, int *gr_return, 460 CharsetPtr *g0_return, CharsetPtr *g1_return, 461 CharsetPtr *g2_return, CharsetPtr *g3_return, 462 CharsetPtr *other_return) 463{ 464 char *resolved = NULL; 465 const LocaleCharsetRec *p; 466 467 if(!charset) { 468 resolved = resolveLocale(locale); 469 if(!resolved) 470 return -1; 471 charset = strrchr(resolved, '.'); 472 if(charset) 473 charset++; 474 else 475 charset = resolved; 476 } 477 478 for(p = localeCharsets; p->name; p++) { 479 if(compare(p->name, charset) == 0) 480 break; 481 } 482 483 if(p->name == NULL) { 484 if (resolved != 0) 485 free(resolved); 486 return -1; 487 } 488 489 *gl_return = p->gl; 490 *gr_return = p->gr; 491 *g0_return = getCharsetByName(p->g0); 492 *g1_return = getCharsetByName(p->g1); 493 *g2_return = getCharsetByName(p->g2); 494 *g3_return = getCharsetByName(p->g3); 495 if(p->other) 496 *other_return = getCharsetByName(p->other); 497 else 498 *other_return = NULL; 499 return 0; 500} 501 502