/* charset.c revision 77683534 */
1/* 2Copyright (c) 2001 by Juliusz Chroboczek 3 4Permission is hereby granted, free of charge, to any person obtaining a copy 5of this software and associated documentation files (the "Software"), to deal 6in the Software without restriction, including without limitation the rights 7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8copies of the Software, and to permit persons to whom the Software is 9furnished to do so, subject to the following conditions: 10 11The above copyright notice and this permission notice shall be included in 12all copies or substantial portions of the Software. 13 14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20THE SOFTWARE. 
21*/ 22 23#ifdef HAVE_CONFIG_H 24# include "config.h" 25#endif 26 27#include <stdlib.h> 28#include <stdio.h> 29#include <string.h> 30#include <ctype.h> 31 32#include "sys.h" 33#include "other.h" 34#include "charset.h" 35#include "parser.h" 36 37#ifndef NULL 38#define NULL 0 39#endif 40 41static unsigned int 42IdentityRecode(unsigned int n, const CharsetRec * self GCC_UNUSED) 43{ 44 return n; 45} 46 47#ifdef UNUSED 48static int 49IdentityReverse(unsigned int n, const CharsetRec * self) 50{ 51#define IS_GL(n) ((n) >= 0x20 && (n) < 0x80) 52 switch (self->type) { 53 case T_94: 54 case T_96: 55 if (IS_GL(n)) 56 return n; 57 else 58 return -1; 59 case T_128: 60 if (n < 0x80) 61 return n; 62 else 63 return -1; 64 case T_9494: 65 case T_9696: 66 if (IS_GL(n >> 8) && IS_GL(n & 0xFF)) 67 return n; 68 else 69 return -1; 70 case T_94192: 71 if (IS_GL(n >> 8) && IS_GL(n & 0x7F)) 72 return n; 73 else 74 return -1; 75 default: 76 abort(); 77 /* NOTREACHED */ 78 } 79#undef IS_GL 80} 81#endif 82 83static int 84NullReverse(unsigned int n GCC_UNUSED, const CharsetRec * self GCC_UNUSED) 85{ 86 return -1; 87} 88 89static const CharsetRec Unknown94Charset = 90{"Unknown (94)", T_94, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0}; 91static const CharsetRec Unknown96Charset = 92{"Unknown (96)", T_96, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0}; 93static const CharsetRec Unknown9494Charset = 94{"Unknown (94x94)", T_9494, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0}; 95static const CharsetRec Unknown9696Charset = 96{"Unknown (96x96)", T_9696, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0}; 97 98typedef struct _FontencCharset { 99 const char *name; 100 int type; 101 unsigned char final; 102 const char *xlfd; 103 unsigned shift; 104 FontMapPtr mapping; 105 FontMapReversePtr reverse; 106} FontencCharsetRec, *FontencCharsetPtr; 107/* *INDENT-OFF* */ 108static FontencCharsetRec fontencCharsets[] = 109{ 110 {"ISO 646 (1973)", T_94, '@', "iso646.1973-0", 0x00, 0, 0}, 111 {"ASCII", 
T_94, 'B', "iso8859-1", 0x00, 0, 0}, 112 {"JIS X 0201:GL", T_94, 'J', "jisx0201.1976-0", 0x00, 0, 0}, 113 {"JIS X 0201:GR", T_94, 'I', "jisx0201.1976-0", 0x80, 0, 0}, 114 {"DEC Special", T_94, '0', "dec-special", 0x00, 0, 0}, 115 {"DEC Technical", T_94, '>', "dec-dectech", 0x00, 0, 0}, 116 117 {"ISO 8859-1", T_96, 'A', "iso8859-1", 0x80, 0, 0}, 118 {"ISO 8859-2", T_96, 'B', "iso8859-2", 0x80, 0, 0}, 119 {"ISO 8859-3", T_96, 'C', "iso8859-3", 0x80, 0, 0}, 120 {"ISO 8859-4", T_96, 'D', "iso8859-4", 0x80, 0, 0}, 121 {"ISO 8859-5", T_96, 'L', "iso8859-5", 0x80, 0, 0}, 122 {"ISO 8859-6", T_96, 'G', "iso8859-6", 0x80, 0, 0}, 123 {"ISO 8859-7", T_96, 'F', "iso8859-7", 0x80, 0, 0}, 124 {"ISO 8859-8", T_96, 'H', "iso8859-8", 0x80, 0, 0}, 125 {"ISO 8859-9", T_96, 'M', "iso8859-9", 0x80, 0, 0}, 126 {"ISO 8859-10", T_96, 'V', "iso8859-10", 0x80, 0, 0}, 127 {"ISO 8859-11", T_96, 'T', "iso8859-11", 0x80, 0, 0}, 128 {"TIS 620", T_96, 'T', "iso8859-11", 0x80, 0, 0}, 129 {"ISO 8859-13", T_96, 'Y', "iso8859-13", 0x80, 0, 0}, 130 {"ISO 8859-14", T_96, '_', "iso8859-14", 0x80, 0, 0}, 131 {"ISO 8859-15", T_96, 'b', "iso8859-15", 0x80, 0, 0}, 132 {"ISO 8859-16", T_96, 'f', "iso8859-16", 0x80, 0, 0}, 133 {"KOI8-E", T_96, '@', "koi8-e", 0x80, 0, 0}, 134 {"TCVN", T_96, 'Z', "tcvn-0", 0x80, 0, 0}, 135 136 {"GB 2312", T_9494, 'A', "gb2312.1980-0", 0x0000, 0, 0}, 137 {"JIS X 0208", T_9494, 'B', "jisx0208.1990-0", 0x0000, 0, 0}, 138 {"KSC 5601", T_9494, 'C', "ksc5601.1987-0", 0x0000, 0, 0}, 139 {"JIS X 0212", T_9494, 'D', "jisx0212.1990-0", 0x0000, 0, 0}, 140 141 {"GB 2312", T_9696, 'A', "gb2312.1980-0", 0x0000, 0, 0}, 142 {"JIS X 0208", T_9696, 'B', "jisx0208.1990-0", 0x0000, 0, 0}, 143 {"KSC 5601", T_9696, 'C', "ksc5601.1987-0", 0x0000, 0, 0}, 144 {"JIS X 0212", T_9696, 'D', "jisx0212.1990-0", 0x0000, 0, 0}, 145 146 {"KOI8-R", T_128, 0, "koi8-r", 0x80, 0, 0}, 147 {"KOI8-U", T_128, 0, "koi8-u", 0x80, 0, 0}, 148 {"KOI8-RU", T_128, 0, "koi8-ru", 0x80, 0, 0}, 149 {"CP 1252", T_128, 0, 
"microsoft-cp1252", 0x80, 0, 0}, 150 {"CP 1251", T_128, 0, "microsoft-cp1251", 0x80, 0, 0}, 151 {"CP 1250", T_128, 0, "microsoft-cp1250", 0x80, 0, 0}, 152 153 {"CP 437", T_128, 0, "ibm-cp437", 0x80, 0, 0}, 154 {"CP 850", T_128, 0, "ibm-cp850", 0x80, 0, 0}, 155 {"CP 866", T_128, 0, "ibm-cp866", 0x80, 0, 0}, 156 157 {"Big 5", T_94192, 0, "big5.eten-0", 0x8000, 0, 0}, 158 {0, 0, 0, 0, 0, 0, 0} 159}; 160/* *INDENT-ON* */ 161 162typedef struct _OtherCharset { 163 const char *name; 164 int (*init) (OtherStatePtr); 165 unsigned int (*mapping) (unsigned int, OtherStatePtr); 166 unsigned int (*reverse) (unsigned int, OtherStatePtr); 167 int (*stack) (unsigned, OtherStatePtr); 168} OtherCharsetRec, *OtherCharsetPtr; 169 170static const OtherCharsetRec otherCharsets[] = 171{ 172 {"GBK", init_gbk, mapping_gbk, reverse_gbk, stack_gbk}, 173 {"UTF-8", init_utf8, mapping_utf8, reverse_utf8, stack_utf8}, 174 {"SJIS", init_sjis, mapping_sjis, reverse_sjis, stack_sjis}, 175 {"BIG5-HKSCS", init_hkscs, mapping_hkscs, reverse_hkscs, stack_hkscs}, 176 {"GB18030", init_gb18030, mapping_gb18030, reverse_gb18030, stack_gb18030}, 177 {0, 0, 0, 0, 0} 178}; 179 180static int 181compare(const char *s, const char *t) 182{ 183 while (*s || *t) { 184 if (*s && (isspace(UChar(*s)) || *s == '-' || *s == '_')) 185 s++; 186 else if (*t && (isspace(UChar(*t)) || *t == '-' || *t == '_')) 187 t++; 188 else if (*s && *t && tolower(UChar(*s)) == tolower(UChar(*t))) { 189 s++; 190 t++; 191 } else 192 return 1; 193 } 194 return 0; 195} 196 197static unsigned int 198FontencCharsetRecode(unsigned int n, const CharsetRec * self) 199{ 200 const FontencCharsetRec *fc = (const FontencCharsetRec *) (self->data); 201 202 return FontEncRecode(n + fc->shift, fc->mapping); 203} 204 205static int 206FontencCharsetReverse(unsigned int i, const CharsetRec * self) 207{ 208 const FontencCharsetRec *fc = (const FontencCharsetRec *) (self->data); 209 unsigned n; 210 211 n = fc->reverse->reverse(i, fc->reverse->data); 212 if 
(n == 0 || n < fc->shift) 213 return -1; 214 else 215 n -= fc->shift; 216 217#define IS_GL(n) ((n) >= 0x20 && (n) < 0x80) 218 switch (self->type) { 219 case T_94: 220 case T_96: 221 if (IS_GL(n)) 222 return (int) n; 223 else 224 return -1; 225 case T_128: 226 if (n < 0x80) 227 return (int) n; 228 else 229 return -1; 230 case T_9494: 231 case T_9696: 232 if (IS_GL(n >> 8) && IS_GL(n & 0xFF)) 233 return (int) n; 234 else 235 return -1; 236 case T_94192: 237 if (IS_GL(n >> 8) && IS_GL(n & 0x7F)) 238 return (int) n; 239 else 240 return -1; 241 default: 242 abort(); 243 /* NOTREACHED */ 244 } 245#undef IS_GL 246} 247 248static CharsetPtr cachedCharsets = NULL; 249 250static CharsetPtr 251getCachedCharset(unsigned final, int type, const char *name) 252{ 253 CharsetPtr c; 254 for (c = cachedCharsets; c; c = c->next) { 255 if (((c->type == type && c->final == final) || 256 (name && !compare(c->name, name))) && 257 (c->type != T_FAILED)) 258 return c; 259 } 260 return NULL; 261} 262 263static void 264cacheCharset(CharsetPtr c) 265{ 266 c->next = cachedCharsets; 267 cachedCharsets = c; 268} 269 270static CharsetPtr 271getFontencCharset(unsigned final, int type, const char *name) 272{ 273 FontencCharsetPtr fc; 274 CharsetPtr c; 275 FontMapPtr mapping; 276 FontMapReversePtr reverse; 277 278 fc = fontencCharsets; 279 while (fc->name) { 280 if (((fc->type == type && fc->final == final) || 281 (name && !compare(fc->name, name))) && 282 (fc->type != T_FAILED)) 283 break; 284 fc++; 285 } 286 287 if (!fc->name) 288 return NULL; 289 290 c = malloc(sizeof(CharsetRec)); 291 if (c == NULL) 292 return NULL; 293 294 mapping = FontEncMapFind(fc->xlfd, FONT_ENCODING_UNICODE, -1, -1, NULL); 295 if (!mapping) { 296 fc->type = T_FAILED; 297 return NULL; 298 } 299 300 reverse = FontMapReverse(mapping); 301 if (!reverse) { 302 fc->type = T_FAILED; 303 return NULL; 304 } 305 306 fc->mapping = mapping; 307 fc->reverse = reverse; 308 309 c->name = fc->name; 310 c->type = fc->type; 311 c->final = 
fc->final; 312 c->recode = FontencCharsetRecode; 313 c->reverse = FontencCharsetReverse; 314 c->data = fc; 315 316 cacheCharset(c); 317 return c; 318} 319 320static CharsetPtr 321getOtherCharset(const char *name) 322{ 323 const OtherCharsetRec *fc; 324 CharsetPtr c; 325 OtherStatePtr s; 326 327 fc = otherCharsets; 328 while (fc->name) { 329 if (name && !compare(fc->name, name)) 330 break; 331 fc++; 332 } 333 334 if (!fc->name) 335 return NULL; 336 337 c = malloc(sizeof(CharsetRec)); 338 if (c == NULL) 339 return NULL; 340 341 s = malloc(sizeof(OtherState)); 342 if (s == NULL) { 343 free(c); 344 return NULL; 345 } 346 347 c->name = fc->name; 348 c->type = T_OTHER; 349 c->final = 0; 350 c->data = fc; 351 c->other_recode = fc->mapping; 352 c->other_reverse = fc->reverse; 353 c->other_stack = fc->stack; 354 c->other_aux = s; 355 356 if (!fc->init(s)) { 357 c->type = T_FAILED; 358 return NULL; 359 } 360 361 cacheCharset(c); 362 return c; 363} 364 365const CharsetRec * 366getUnknownCharset(int type) 367{ 368 switch (type) { 369 case T_94: 370 return &Unknown94Charset; 371 case T_96: 372 return &Unknown96Charset; 373 case T_9494: 374 return &Unknown9494Charset; 375 case T_9696: 376 return &Unknown9696Charset; 377 default: 378 return &Unknown94Charset; 379 } 380} 381 382const CharsetRec * 383getCharset(unsigned final, int type) 384{ 385 const CharsetRec *c; 386 387 c = getCachedCharset(final, type, NULL); 388 if (c) 389 return c; 390 391 c = getFontencCharset(final, type, NULL); 392 if (c) 393 return c; 394 395 return getUnknownCharset(type); 396} 397 398const CharsetRec * 399getCharsetByName(const char *name) 400{ 401 const CharsetRec *c; 402 403 if (name == NULL) 404 return getUnknownCharset(T_94); 405 406 c = getCachedCharset(0, 0, name); 407 if (c) 408 return c; 409 410 c = getFontencCharset(0, 0, name); 411 if (c) 412 return c; 413 414 c = getOtherCharset(name); 415 if (c) 416 return c; 417 418 return getUnknownCharset(T_94); 419} 420/* *INDENT-OFF* */ 421static const 
LocaleCharsetRec localeCharsets[] = 422{ 423 {"C", 0, 2, "ASCII", NULL, "ISO 8859-1", NULL, NULL}, 424 {"POSIX", 0, 2, "ASCII", NULL, "ISO 8859-1", NULL, NULL}, 425 {"ISO8859-1", 0, 2, "ASCII", NULL, "ISO 8859-1", NULL, NULL}, 426 {"ISO8859-2", 0, 2, "ASCII", NULL, "ISO 8859-2", NULL, NULL}, 427 {"ISO8859-3", 0, 2, "ASCII", NULL, "ISO 8859-3", NULL, NULL}, 428 {"ISO8859-4", 0, 2, "ASCII", NULL, "ISO 8859-4", NULL, NULL}, 429 {"ISO8859-5", 0, 2, "ASCII", NULL, "ISO 8859-5", NULL, NULL}, 430 {"ISO8859-6", 0, 2, "ASCII", NULL, "ISO 8859-6", NULL, NULL}, 431 {"ISO8859-7", 0, 2, "ASCII", NULL, "ISO 8859-7", NULL, NULL}, 432 {"ISO8859-8", 0, 2, "ASCII", NULL, "ISO 8859-8", NULL, NULL}, 433 {"ISO8859-9", 0, 2, "ASCII", NULL, "ISO 8859-9", NULL, NULL}, 434 {"ISO8859-10", 0, 2, "ASCII", NULL, "ISO 8859-10", NULL, NULL}, 435 {"ISO8859-11", 0, 2, "ASCII", NULL, "ISO 8859-11", NULL, NULL}, 436 {"TIS620", 0, 2, "ASCII", NULL, "ISO 8859-11", NULL, NULL}, 437 {"ISO8859-13", 0, 2, "ASCII", NULL, "ISO 8859-13", NULL, NULL}, 438 {"ISO8859-14", 0, 2, "ASCII", NULL, "ISO 8859-14", NULL, NULL}, 439 {"ISO8859-15", 0, 2, "ASCII", NULL, "ISO 8859-15", NULL, NULL}, 440 {"ISO8859-16", 0, 2, "ASCII", NULL, "ISO 8859-16", NULL, NULL}, 441 {"KOI8-R", 0, 2, "ASCII", NULL, "KOI8-R", NULL, NULL}, 442 {"CP1251", 0, 2, "ASCII", NULL, "CP 1251", NULL, NULL}, 443 {"TCVN", 0, 2, "ASCII", NULL, "TCVN", NULL, NULL}, 444 {"eucCN", 0, 1, "ASCII", "GB 2312", NULL, NULL, NULL}, 445 {"GB2312", 0, 1, "ASCII", "GB 2312", NULL, NULL, NULL}, 446 {"eucJP", 0, 1, "ASCII", "JIS X 0208", "JIS X 0201:GR", "JIS X 0212", NULL}, 447 {"eucKR", 0, 1, "ASCII", "KSC 5601", NULL, NULL, NULL}, 448 {"eucCN", 0, 1, "ASCII", "GB 2312", NULL, NULL, NULL}, 449 {"Big5", 0, 1, "ASCII", "Big 5", NULL, NULL, NULL}, 450 {"gbk", 0, 1, NULL, NULL, NULL, NULL, "GBK"}, 451 {"UTF-8", 0, 1, NULL, NULL, NULL, NULL, "UTF-8"}, 452 {"SJIS", 0, 1, NULL, NULL, NULL, NULL, "SJIS"}, 453 {"Big5-HKSCS", 0, 1, NULL, NULL, NULL, NULL, "BIG5-HKSCS"}, 454 
{"gb18030", 0, 1, NULL, NULL, NULL, NULL, "GB18030"}, 455 {0, 0, 0, 0, 0, 0, 0, 0} 456}; 457/* *INDENT-ON* */ 458 459void 460reportCharsets(void) 461{ 462 const LocaleCharsetRec *p; 463 FontencCharsetPtr q; 464 printf("Known locale encodings:\n\n"); 465 for (p = localeCharsets; p->name; p++) { 466 if (p->other) { 467 printf(" %s (non-ISO-2022 encoding)\n", p->other); 468 continue; 469 } 470 printf(" %s: GL -> G%d, GR -> G%d", p->name, p->gl, p->gr); 471 if (p->g0) 472 printf(", G0: %s", p->g0); 473 if (p->g1) 474 printf(", G1: %s", p->g1); 475 if (p->g2) 476 printf(", G2: %s", p->g2); 477 if (p->g3) 478 printf(", G3: %s", p->g3); 479 printf("\n"); 480 } 481 482 printf("\n\nKnown charsets (not all may be available):\n\n"); 483 for (q = fontencCharsets; q->name; q++) 484 printf(" %s%s\n", 485 q->name, q->final ? " (ISO 2022)" : ""); 486} 487 488int 489getLocaleState(const char *locale, 490 const char *charset, 491 int *gl_return, int *gr_return, 492 const CharsetRec * *g0_return, 493 const CharsetRec * *g1_return, 494 const CharsetRec * *g2_return, 495 const CharsetRec * *g3_return, 496 const CharsetRec * *other_return) 497{ 498 int result = 0; 499 char *resolved = 0; 500 const LocaleCharsetRec *p; 501 502 if (!charset) { 503 resolved = resolveLocale(locale); 504 if (!resolved) 505 return -1; 506 charset = strrchr(resolved, '.'); 507 if (charset) 508 charset++; 509 else 510 charset = resolved; 511 } 512 513 for (p = localeCharsets; p->name; p++) { 514 if (compare(p->name, charset) == 0) 515 break; 516 } 517 518 if (p->name == NULL) { 519 result = -1; 520 } else { 521 522 *gl_return = p->gl; 523 *gr_return = p->gr; 524 *g0_return = getCharsetByName(p->g0); 525 *g1_return = getCharsetByName(p->g1); 526 *g2_return = getCharsetByName(p->g2); 527 *g3_return = getCharsetByName(p->g3); 528 if (p->other) 529 *other_return = getCharsetByName(p->other); 530 else 531 *other_return = NULL; 532 } 533 if (resolved != 0) 534 free(resolved); 535 return result; 536} 537 538#ifdef 
NO_LEAKS 539static int 540isUnknownCharsetPtr(CharsetPtr p) 541{ 542 return (p == &Unknown94Charset 543 || p == &Unknown96Charset 544 || p == &Unknown9494Charset 545 || p == &Unknown9696Charset); 546} 547 548static void 549destroyFontencCharsetPtr(FontencCharsetPtr p) 550{ 551 p->mapping = 0; 552 553 /* 554 * This should, but does not work - 555 * FontMapReverseFree(p->reverse) 556 * 557 * The iteration for map[] is based on reading the source of 558 * FontMapReverse(). 559 */ 560 if (p->reverse) { 561 int n; 562 unsigned **map = p->reverse->data; 563 for (n = 0; n < 256; ++n) { 564 if (map[n]) 565 free(map[n]); 566 } 567 free(p->reverse->data); 568 free(p->reverse); 569 p->reverse = 0; 570 } 571} 572 573static void 574destroyCharset(CharsetPtr p) 575{ 576 if (!isUnknownCharsetPtr(p)) { 577 destroyFontencCharsetPtr(p->data); 578 free(p); 579 } 580} 581 582void 583charset_leaks(void) 584{ 585 while (cachedCharsets != 0) { 586 CharsetPtr next = cachedCharsets->next; 587 destroyCharset(cachedCharsets); 588 cachedCharsets = next; 589 } 590} 591#endif 592