1/* 2Copyright (c) 2001 by Juliusz Chroboczek 3 4Permission is hereby granted, free of charge, to any person obtaining a copy 5of this software and associated documentation files (the "Software"), to deal 6in the Software without restriction, including without limitation the rights 7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8copies of the Software, and to permit persons to whom the Software is 9furnished to do so, subject to the following conditions: 10 11The above copyright notice and this permission notice shall be included in 12all copies or substantial portions of the Software. 13 14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20THE SOFTWARE. 21*/ 22 23#ifdef HAVE_CONFIG_H 24# include "config.h" 25#endif 26 27#include <stdlib.h> 28#include <stdio.h> 29#include <string.h> 30#include <ctype.h> 31 32#include "sys.h" 33#include "other.h" 34#include "charset.h" 35#include "parser.h" 36 37#ifndef NULL 38#define NULL 0 39#endif 40 41static unsigned int 42IdentityRecode(unsigned int n, const CharsetRec * self GCC_UNUSED) 43{ 44 return n; 45} 46 47#ifdef UNUSED 48static int 49IdentityReverse(unsigned int n, const CharsetRec * self) 50{ 51#define IS_GL(n) ((n) >= 0x20 && (n) < 0x80) 52 switch (self->type) { 53 case T_94: 54 case T_96: 55 if (IS_GL(n)) 56 return n; 57 else 58 return -1; 59 case T_128: 60 if (n < 0x80) 61 return n; 62 else 63 return -1; 64 case T_9494: 65 case T_9696: 66 if (IS_GL(n >> 8) && IS_GL(n & 0xFF)) 67 return n; 68 else 69 return -1; 70 case T_94192: 71 if (IS_GL(n >> 8) && IS_GL(n & 0x7F)) 72 return n; 73 else 74 return -1; 75 default: 76 abort(); 77 /* NOTREACHED */ 78 } 79#undef IS_GL 80} 81#endif 82 83static int 84NullReverse(unsigned int n GCC_UNUSED, const CharsetRec * self GCC_UNUSED) 85{ 86 return -1; 87} 88 89static const CharsetRec Unknown94Charset = 90{"Unknown (94)", T_94, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0}; 91static const CharsetRec Unknown96Charset = 92{"Unknown (96)", T_96, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0}; 93static const CharsetRec Unknown9494Charset = 94{"Unknown (94x94)", T_9494, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0}; 95static const CharsetRec Unknown9696Charset = 96{"Unknown (96x96)", T_9696, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0}; 97 98typedef struct _FontencCharset { 99 const char *name; 100 int type; 101 unsigned char final; 102 const char *xlfd; 103 unsigned shift; 104 FontMapPtr mapping; 105 FontMapReversePtr reverse; 106} FontencCharsetRec, *FontencCharsetPtr; 107/* *INDENT-OFF* */ 108static FontencCharsetRec fontencCharsets[] = 109{ 110 {"ISO 646 (1973)", T_94, '@', "iso646.1973-0", 0x00, 0, 0}, 111 {"ASCII", T_94, 'B', "iso8859-1", 0x00, 0, 0}, 112 {"JIS X 0201:GL", T_94, 'J', "jisx0201.1976-0", 0x00, 0, 0}, 113 {"JIS X 0201:GR", T_94, 'I', "jisx0201.1976-0", 0x80, 0, 0}, 114 {"DEC Special", T_94, '0', "dec-special", 0x00, 0, 0}, 115 {"DEC Technical", T_94, '>', "dec-dectech", 0x00, 0, 0}, 116 117 {"ISO 8859-1", T_96, 'A', "iso8859-1", 0x80, 0, 0}, 118 {"ISO 8859-2", T_96, 'B', "iso8859-2", 0x80, 0, 0}, 119 {"ISO 8859-3", T_96, 'C', "iso8859-3", 0x80, 0, 0}, 120 {"ISO 8859-4", T_96, 'D', "iso8859-4", 0x80, 0, 0}, 121 {"ISO 8859-5", T_96, 'L', "iso8859-5", 0x80, 0, 0}, 122 {"ISO 8859-6", T_96, 'G', "iso8859-6", 0x80, 0, 0}, 123 {"ISO 8859-7", T_96, 'F', "iso8859-7", 0x80, 0, 0}, 124 {"ISO 8859-8", T_96, 'H', "iso8859-8", 0x80, 0, 0}, 125 {"ISO 8859-9", T_96, 'M', "iso8859-9", 0x80, 0, 0}, 126 {"ISO 8859-10", T_96, 'V', "iso8859-10", 0x80, 0, 0}, 127 {"ISO 8859-11", T_96, 'T', "iso8859-11", 0x80, 0, 0}, 128 {"TIS 620", T_96, 'T', "iso8859-11", 0x80, 0, 0}, 129 {"ISO 8859-13", T_96, 'Y', "iso8859-13", 0x80, 0, 0}, 130 {"ISO 8859-14", T_96, '_', "iso8859-14", 0x80, 0, 0}, 131 {"ISO 8859-15", T_96, 'b', "iso8859-15", 0x80, 0, 0}, 132 {"ISO 8859-16", T_96, 'f', "iso8859-16", 0x80, 0, 0}, 133 {"KOI8-E", T_96, '@', "koi8-e", 0x80, 0, 0}, 134 {"TCVN", T_96, 'Z', "tcvn-0", 0x80, 0, 0}, 135 136 {"GB 2312", T_9494, 'A', "gb2312.1980-0", 0x0000, 0, 0}, 137 {"JIS X 0208", T_9494, 'B', "jisx0208.1990-0", 0x0000, 0, 0}, 138 {"KSC 5601", T_9494, 'C', "ksc5601.1987-0", 0x0000, 0, 0}, 139 {"JIS X 0212", T_9494, 'D', "jisx0212.1990-0", 0x0000, 0, 0}, 140 141 {"GB 2312", T_9696, 'A', "gb2312.1980-0", 0x0000, 0, 0}, 142 {"JIS X 0208", T_9696, 'B', "jisx0208.1990-0", 0x0000, 0, 0}, 143 {"KSC 5601", T_9696, 'C', "ksc5601.1987-0", 0x0000, 0, 0}, 144 {"JIS X 0212", T_9696, 'D', "jisx0212.1990-0", 0x0000, 0, 0}, 145 146 {"KOI8-R", T_128, 0, "koi8-r", 0x80, 0, 0}, 147 {"KOI8-U", T_128, 0, "koi8-u", 0x80, 0, 0}, 148 {"KOI8-RU", T_128, 0, "koi8-ru", 0x80, 0, 0}, 149 {"CP 1252", T_128, 0, "microsoft-cp1252", 0x80, 0, 0}, 150 {"CP 1251", T_128, 0, "microsoft-cp1251", 0x80, 0, 0}, 151 {"CP 1250", T_128, 0, "microsoft-cp1250", 0x80, 0, 0}, 152 153 {"CP 437", T_128, 0, "ibm-cp437", 0x80, 0, 0}, 154 {"CP 850", T_128, 0, "ibm-cp850", 0x80, 0, 0}, 155 {"CP 866", T_128, 0, "ibm-cp866", 0x80, 0, 0}, 156 157 {"Big 5", T_94192, 0, "big5.eten-0", 0x8000, 0, 0}, 158 {0, 0, 0, 0, 0, 0, 0} 159}; 160/* *INDENT-ON* */ 161 162typedef struct _OtherCharset { 163 const char *name; 164 int (*init) (OtherStatePtr); 165 unsigned int (*mapping) (unsigned int, OtherStatePtr); 166 unsigned int (*reverse) (unsigned int, OtherStatePtr); 167 int (*stack) (unsigned, OtherStatePtr); 168} OtherCharsetRec, *OtherCharsetPtr; 169 170static const OtherCharsetRec otherCharsets[] = 171{ 172 {"GBK", init_gbk, mapping_gbk, reverse_gbk, stack_gbk}, 173 {"UTF-8", init_utf8, mapping_utf8, reverse_utf8, stack_utf8}, 174 {"SJIS", init_sjis, mapping_sjis, reverse_sjis, stack_sjis}, 175 {"BIG5-HKSCS", init_hkscs, mapping_hkscs, reverse_hkscs, stack_hkscs}, 176 {"GB18030", init_gb18030, mapping_gb18030, reverse_gb18030, stack_gb18030}, 177 {0, 0, 0, 0, 0} 178}; 179 180static int 181compare(const char *s, const char *t) 182{ 183 while (*s || *t) { 184 if (*s && (isspace(UChar(*s)) || *s == '-' || *s == '_')) 185 s++; 186 else if (*t && (isspace(UChar(*t)) || *t == '-' || *t == '_')) 187 t++; 188 else if (*s && *t && tolower(UChar(*s)) == tolower(UChar(*t))) { 189 s++; 190 t++; 191 } else 192 return 1; 193 } 194 return 0; 195} 196 197static unsigned int 198FontencCharsetRecode(unsigned int n, const CharsetRec * self) 199{ 200 const FontencCharsetRec *fc = (const FontencCharsetRec *) (self->data); 201 202 return FontEncRecode(n + fc->shift, fc->mapping); 203} 204 205static int 206FontencCharsetReverse(unsigned int i, const CharsetRec * self) 207{ 208 const FontencCharsetRec *fc = (const FontencCharsetRec *) (self->data); 209 unsigned n; 210 211 n = fc->reverse->reverse(i, fc->reverse->data); 212 if (n == 0 || n < fc->shift) 213 return -1; 214 else 215 n -= fc->shift; 216 217#define IS_GL(n) ((n) >= 0x20 && (n) < 0x80) 218 switch (self->type) { 219 case T_94: 220 case T_96: 221 if (IS_GL(n)) 222 return (int) n; 223 else 224 return -1; 225 case T_128: 226 if (n < 0x80) 227 return (int) n; 228 else 229 return -1; 230 case T_9494: 231 case T_9696: 232 if (IS_GL(n >> 8) && IS_GL(n & 0xFF)) 233 return (int) n; 234 else 235 return -1; 236 case T_94192: 237 if (IS_GL(n >> 8) && IS_GL(n & 0x7F)) 238 return (int) n; 239 else 240 return -1; 241 default: 242 abort(); 243 /* NOTREACHED */ 244 } 245#undef IS_GL 246} 247 248static CharsetPtr cachedCharsets = NULL; 249 250static CharsetPtr 251getCachedCharset(unsigned final, int type, const char *name) 252{ 253 CharsetPtr c; 254 for (c = cachedCharsets; c; c = c->next) { 255 if (((c->type == type && c->final == final) || 256 (name && !compare(c->name, name))) && 257 (c->type != T_FAILED)) 258 return c; 259 } 260 return NULL; 261} 262 263static void 264cacheCharset(CharsetPtr c) 265{ 266 c->next = cachedCharsets; 267 cachedCharsets = c; 268} 269 270static CharsetPtr 271getFontencCharset(unsigned final, int type, const char *name) 272{ 273 FontencCharsetPtr fc; 274 CharsetPtr c; 275 FontMapPtr mapping; 276 FontMapReversePtr reverse; 277 278 fc = fontencCharsets; 279 while (fc->name) { 280 if (((fc->type == type && fc->final == final) || 281 (name && !compare(fc->name, name))) && 282 (fc->type != T_FAILED)) 283 break; 284 fc++; 285 } 286 287 if (!fc->name) 288 return NULL; 289 290 c = malloc(sizeof(CharsetRec)); 291 if (c == NULL) 292 return NULL; 293 294 mapping = FontEncMapFind(fc->xlfd, FONT_ENCODING_UNICODE, -1, -1, NULL); 295 if (!mapping) { 296 free(c); 297 fc->type = T_FAILED; 298 return NULL; 299 } 300 301 reverse = FontMapReverse(mapping); 302 if (!reverse) { 303 free(c); 304 fc->type = T_FAILED; 305 return NULL; 306 } 307 308 fc->mapping = mapping; 309 fc->reverse = reverse; 310 311 c->name = fc->name; 312 c->type = fc->type; 313 c->final = fc->final; 314 c->recode = FontencCharsetRecode; 315 c->reverse = FontencCharsetReverse; 316 c->data = fc; 317 318 cacheCharset(c); 319 return c; 320} 321 322static CharsetPtr 323getOtherCharset(const char *name) 324{ 325 const OtherCharsetRec *fc; 326 CharsetPtr c; 327 OtherStatePtr s; 328 329 fc = otherCharsets; 330 while (fc->name) { 331 if (name && !compare(fc->name, name)) 332 break; 333 fc++; 334 } 335 336 if (!fc->name) 337 return NULL; 338 339 c = malloc(sizeof(CharsetRec)); 340 if (c == NULL) 341 return NULL; 342 343 s = malloc(sizeof(OtherState)); 344 if (s == NULL) { 345 free(c); 346 return NULL; 347 } 348 349 c->name = fc->name; 350 c->type = T_OTHER; 351 c->final = 0; 352 c->data = fc; 353 c->other_recode = fc->mapping; 354 c->other_reverse = fc->reverse; 355 c->other_stack = fc->stack; 356 c->other_aux = s; 357 358 if (!fc->init(s)) { 359 c->type = T_FAILED; 360 return NULL; 361 } 362 363 cacheCharset(c); 364 return c; 365} 366 367const CharsetRec * 368getUnknownCharset(int type) 369{ 370 switch (type) { 371 case T_94: 372 return &Unknown94Charset; 373 case T_96: 374 return &Unknown96Charset; 375 case T_9494: 376 return &Unknown9494Charset; 377 case T_9696: 378 return &Unknown9696Charset; 379 default: 380 return &Unknown94Charset; 381 } 382} 383 384const CharsetRec * 385getCharset(unsigned final, int type) 386{ 387 const CharsetRec *c; 388 389 c = getCachedCharset(final, type, NULL); 390 if (c) 391 return c; 392 393 c = getFontencCharset(final, type, NULL); 394 if (c) 395 return c; 396 397 return getUnknownCharset(type); 398} 399 400const CharsetRec * 401getCharsetByName(const char *name) 402{ 403 const CharsetRec *c; 404 405 if (name == NULL) 406 return getUnknownCharset(T_94); 407 408 c = getCachedCharset(0, 0, name); 409 if (c) 410 return c; 411 412 c = getFontencCharset(0, 0, name); 413 if (c) 414 return c; 415 416 c = getOtherCharset(name); 417 if (c) 418 return c; 419 420 return getUnknownCharset(T_94); 421} 422/* *INDENT-OFF* */ 423static const LocaleCharsetRec localeCharsets[] = 424{ 425 {"C", 0, 2, "ASCII", NULL, "ISO 8859-1", NULL, NULL}, 426 {"POSIX", 0, 2, "ASCII", NULL, "ISO 8859-1", NULL, NULL}, 427 {"ISO8859-1", 0, 2, "ASCII", NULL, "ISO 8859-1", NULL, NULL}, 428 {"ISO8859-2", 0, 2, "ASCII", NULL, "ISO 8859-2", NULL, NULL}, 429 {"ISO8859-3", 0, 2, "ASCII", NULL, "ISO 8859-3", NULL, NULL}, 430 {"ISO8859-4", 0, 2, "ASCII", NULL, "ISO 8859-4", NULL, NULL}, 431 {"ISO8859-5", 0, 2, "ASCII", NULL, "ISO 8859-5", NULL, NULL}, 432 {"ISO8859-6", 0, 2, "ASCII", NULL, "ISO 8859-6", NULL, NULL}, 433 {"ISO8859-7", 0, 2, "ASCII", NULL, "ISO 8859-7", NULL, NULL}, 434 {"ISO8859-8", 0, 2, "ASCII", NULL, "ISO 8859-8", NULL, NULL}, 435 {"ISO8859-9", 0, 2, "ASCII", NULL, "ISO 8859-9", NULL, NULL}, 436 {"ISO8859-10", 0, 2, "ASCII", NULL, "ISO 8859-10", NULL, NULL}, 437 {"ISO8859-11", 0, 2, "ASCII", NULL, "ISO 8859-11", NULL, NULL}, 438 {"TIS620", 0, 2, "ASCII", NULL, "ISO 8859-11", NULL, NULL}, 439 {"ISO8859-13", 0, 2, "ASCII", NULL, "ISO 8859-13", NULL, NULL}, 440 {"ISO8859-14", 0, 2, "ASCII", NULL, "ISO 8859-14", NULL, NULL}, 441 {"ISO8859-15", 0, 2, "ASCII", NULL, "ISO 8859-15", NULL, NULL}, 442 {"ISO8859-16", 0, 2, "ASCII", NULL, "ISO 8859-16", NULL, NULL}, 443 {"KOI8-R", 0, 2, "ASCII", NULL, "KOI8-R", NULL, NULL}, 444 {"CP1251", 0, 2, "ASCII", NULL, "CP 1251", NULL, NULL}, 445 {"TCVN", 0, 2, "ASCII", NULL, "TCVN", NULL, NULL}, 446 {"eucCN", 0, 1, "ASCII", "GB 2312", NULL, NULL, NULL}, 447 {"GB2312", 0, 1, "ASCII", "GB 2312", NULL, NULL, NULL}, 448 {"eucJP", 0, 1, "ASCII", "JIS X 0208", "JIS X 0201:GR", "JIS X 0212", NULL}, 449 {"eucKR", 0, 1, "ASCII", "KSC 5601", NULL, NULL, NULL}, 450 {"eucCN", 0, 1, "ASCII", "GB 2312", NULL, NULL, NULL}, 451 {"Big5", 0, 1, "ASCII", "Big 5", NULL, NULL, NULL}, 452 {"gbk", 0, 1, NULL, NULL, NULL, NULL, "GBK"}, 453 {"UTF-8", 0, 1, NULL, NULL, NULL, NULL, "UTF-8"}, 454 {"SJIS", 0, 1, NULL, NULL, NULL, NULL, "SJIS"}, 455 {"Big5-HKSCS", 0, 1, NULL, NULL, NULL, NULL, "BIG5-HKSCS"}, 456 {"gb18030", 0, 1, NULL, NULL, NULL, NULL, "GB18030"}, 457 {0, 0, 0, 0, 0, 0, 0, 0} 458}; 459/* *INDENT-ON* */ 460 461void 462reportCharsets(void) 463{ 464 const LocaleCharsetRec *p; 465 FontencCharsetPtr q; 466 printf("Known locale encodings:\n\n"); 467 for (p = localeCharsets; p->name; p++) { 468 if (p->other) { 469 printf(" %s (non-ISO-2022 encoding)\n", p->other); 470 continue; 471 } 472 printf(" %s: GL -> G%d, GR -> G%d", p->name, p->gl, p->gr); 473 if (p->g0) 474 printf(", G0: %s", p->g0); 475 if (p->g1) 476 printf(", G1: %s", p->g1); 477 if (p->g2) 478 printf(", G2: %s", p->g2); 479 if (p->g3) 480 printf(", G3: %s", p->g3); 481 printf("\n"); 482 } 483 484 printf("\n\nKnown charsets (not all may be available):\n\n"); 485 for (q = fontencCharsets; q->name; q++) 486 printf(" %s%s\n", 487 q->name, q->final ? " (ISO 2022)" : ""); 488} 489 490int 491getLocaleState(const char *locale, 492 const char *charset, 493 int *gl_return, int *gr_return, 494 const CharsetRec * *g0_return, 495 const CharsetRec * *g1_return, 496 const CharsetRec * *g2_return, 497 const CharsetRec * *g3_return, 498 const CharsetRec * *other_return) 499{ 500 int result = 0; 501 char *resolved = 0; 502 const LocaleCharsetRec *p; 503 504 if (!charset) { 505 resolved = resolveLocale(locale); 506 if (!resolved) 507 return -1; 508 charset = strrchr(resolved, '.'); 509 if (charset) 510 charset++; 511 else 512 charset = resolved; 513 } 514 515 for (p = localeCharsets; p->name; p++) { 516 if (compare(p->name, charset) == 0) 517 break; 518 } 519 520 if (p->name == NULL) { 521 result = -1; 522 } else { 523 524 *gl_return = p->gl; 525 *gr_return = p->gr; 526 *g0_return = getCharsetByName(p->g0); 527 *g1_return = getCharsetByName(p->g1); 528 *g2_return = getCharsetByName(p->g2); 529 *g3_return = getCharsetByName(p->g3); 530 if (p->other) 531 *other_return = getCharsetByName(p->other); 532 else 533 *other_return = NULL; 534 } 535 if (resolved != 0) 536 free(resolved); 537 return result; 538} 539 540#ifdef NO_LEAKS 541static int 542isUnknownCharsetPtr(CharsetPtr p) 543{ 544 return (p == &Unknown94Charset 545 || p == &Unknown96Charset 546 || p == &Unknown9494Charset 547 || p == &Unknown9696Charset); 548} 549 550static void 551destroyFontencCharsetPtr(FontencCharsetPtr p) 552{ 553 p->mapping = 0; 554 555 /* 556 * This should, but does not work - 557 * FontMapReverseFree(p->reverse) 558 * 559 * The iteration for map[] is based on reading the source of 560 * FontMapReverse(). 561 */ 562 if (p->reverse) { 563 int n; 564 unsigned **map = p->reverse->data; 565 for (n = 0; n < 256; ++n) { 566 if (map[n]) 567 free(map[n]); 568 } 569 free(p->reverse->data); 570 free(p->reverse); 571 p->reverse = 0; 572 } 573} 574 575static void 576destroyCharset(CharsetPtr p) 577{ 578 if (!isUnknownCharsetPtr(p)) { 579 destroyFontencCharsetPtr(p->data); 580 free(p); 581 } 582} 583 584void 585charset_leaks(void) 586{ 587 while (cachedCharsets != 0) { 588 CharsetPtr next = cachedCharsets->next; 589 destroyCharset(cachedCharsets); 590 cachedCharsets = next; 591 } 592} 593#endif 594