/* lcCT.c revision 9c019ec5 */
1/* 2 * Copyright 1992, 1993 by TOSHIBA Corp. 3 * 4 * Permission to use, copy, modify, and distribute this software and its 5 * documentation for any purpose and without fee is hereby granted, provided 6 * that the above copyright notice appear in all copies and that both that 7 * copyright notice and this permission notice appear in supporting 8 * documentation, and that the name of TOSHIBA not be used in advertising 9 * or publicity pertaining to distribution of the software without specific, 10 * written prior permission. TOSHIBA make no representations about the 11 * suitability of this software for any purpose. It is provided "as is" 12 * without express or implied warranty. 13 * 14 * TOSHIBA DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING 15 * ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL 16 * TOSHIBA BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR 17 * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 18 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 19 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS 20 * SOFTWARE. 21 * 22 * Author: Katsuhisa Yano TOSHIBA Corp. 23 * mopi@osa.ilab.toshiba.co.jp 24 */ 25/* 26 * Copyright 1995 by FUJITSU LIMITED 27 * This is source code modified by FUJITSU LIMITED under the Joint 28 * Development Agreement for the CDE/Motif PST. 29 * 30 * Modifier: Takanori Tateno FUJITSU LIMITED 31 * 32 */ 33/* 34 * 2000 35 * Modifier: Ivan Pascal The XFree86 Project 36 * Modifier: Bruno Haible The XFree86 Project 37 */ 38 39#ifdef HAVE_CONFIG_H 40#include <config.h> 41#endif 42#include "Xlibint.h" 43#include "XlcPubI.h" 44#include <X11/Xos.h> 45#include <stdio.h> 46 47 48/* ====================== Built-in Character Sets ====================== */ 49 50/* 51 * Static representation of a character set that can be used in Compound Text. 
52 */ 53typedef struct _CTDataRec { 54 const char name[19]; 55 const char ct_sequence[5]; /* Compound Text encoding, ESC sequence */ 56} CTDataRec, *CTData; 57 58static const CTDataRec default_ct_data[] = 59{ 60 /* */ 61 /* X11 registry name MIME name ISO-IR ESC sequence */ 62 /* */ 63 64 /* Registered character sets with one byte per character */ 65 { "ISO8859-1:GL", /* US-ASCII 6 */ "\033(B" }, 66 { "ISO8859-1:GR", /* ISO-8859-1 100 */ "\033-A" }, 67 { "ISO8859-2:GR", /* ISO-8859-2 101 */ "\033-B" }, 68 { "ISO8859-3:GR", /* ISO-8859-3 109 */ "\033-C" }, 69 { "ISO8859-4:GR", /* ISO-8859-4 110 */ "\033-D" }, 70 { "ISO8859-5:GR", /* ISO-8859-5 144 */ "\033-L" }, 71 { "ISO8859-6:GR", /* ISO-8859-6 127 */ "\033-G" }, 72 { "ISO8859-7:GR", /* ISO-8859-7 126 */ "\033-F" }, 73 { "ISO8859-8:GR", /* ISO-8859-8 138 */ "\033-H" }, 74 { "ISO8859-9:GR", /* ISO-8859-9 148 */ "\033-M" }, 75 { "ISO8859-10:GR", /* ISO-8859-10 157 */ "\033-V" }, 76 { "ISO8859-11:GR", /* ISO-8859-11 166 */ "\033-T" }, 77 { "ISO8859-13:GR", /* ISO-8859-13 179 */ "\033-Y" }, 78 { "ISO8859-14:GR", /* ISO-8859-14 199 */ "\033-_" }, 79 { "ISO8859-15:GR", /* ISO-8859-15 203 */ "\033-b" }, 80 { "ISO8859-16:GR", /* ISO-8859-16 226 */ "\033-f" }, 81 { "JISX0201.1976-0:GL", /* ISO-646-JP 14 */ "\033(J" }, 82 { "JISX0201.1976-0:GR", "\033)I" }, 83#if 0 84 { "TIS620-0:GR", /* TIS-620 166 */ "\033-T" }, 85#endif 86 87 /* Registered character sets with two byte per character */ 88 { "GB2312.1980-0:GL", /* GB_2312-80 58 */ "\033$(A" }, 89 { "GB2312.1980-0:GR", /* GB_2312-80 58 */ "\033$)A" }, 90 { "JISX0208.1983-0:GL", /* JIS_X0208-1983 87 */ "\033$(B" }, 91 { "JISX0208.1983-0:GR", /* JIS_X0208-1983 87 */ "\033$)B" }, 92 { "JISX0208.1990-0:GL", /* JIS_X0208-1990 168 */ "\033$(B" }, 93 { "JISX0208.1990-0:GR", /* JIS_X0208-1990 168 */ "\033$)B" }, 94 { "JISX0212.1990-0:GL", /* JIS_X0212-1990 159 */ "\033$(D" }, 95 { "JISX0212.1990-0:GR", /* JIS_X0212-1990 159 */ "\033$)D" }, 96 { "KSC5601.1987-0:GL", /* 
KS_C_5601-1987 149 */ "\033$(C" }, 97 { "KSC5601.1987-0:GR", /* KS_C_5601-1987 149 */ "\033$)C" }, 98 { "CNS11643.1986-1:GL", /* CNS 11643-1992 pl.1 171 */ "\033$(G" }, 99 { "CNS11643.1986-1:GR", /* CNS 11643-1992 pl.1 171 */ "\033$)G" }, 100 { "CNS11643.1986-2:GL", /* CNS 11643-1992 pl.2 172 */ "\033$(H" }, 101 { "CNS11643.1986-2:GR", /* CNS 11643-1992 pl.2 172 */ "\033$)H" }, 102 { "CNS11643.1992-3:GL", /* CNS 11643-1992 pl.3 183 */ "\033$(I" }, 103 { "CNS11643.1992-3:GR", /* CNS 11643-1992 pl.3 183 */ "\033$)I" }, 104 { "CNS11643.1992-4:GL", /* CNS 11643-1992 pl.4 184 */ "\033$(J" }, 105 { "CNS11643.1992-4:GR", /* CNS 11643-1992 pl.4 184 */ "\033$)J" }, 106 { "CNS11643.1992-5:GL", /* CNS 11643-1992 pl.5 185 */ "\033$(K" }, 107 { "CNS11643.1992-5:GR", /* CNS 11643-1992 pl.5 185 */ "\033$)K" }, 108 { "CNS11643.1992-6:GL", /* CNS 11643-1992 pl.6 186 */ "\033$(L" }, 109 { "CNS11643.1992-6:GR", /* CNS 11643-1992 pl.6 186 */ "\033$)L" }, 110 { "CNS11643.1992-7:GL", /* CNS 11643-1992 pl.7 187 */ "\033$(M" }, 111 { "CNS11643.1992-7:GR", /* CNS 11643-1992 pl.7 187 */ "\033$)M" }, 112 113 /* Registered encodings with a varying number of bytes per character */ 114 { "ISO10646-1", /* UTF-8 196 */ "\033%G" }, 115 116 /* Encodings without ISO-IR assigned escape sequence must be 117 defined in XLC_LOCALE files, using "\033%/1" or "\033%/2". */ 118 119 /* Backward compatibility with XFree86 3.x */ 120#if 1 121 { "ISO8859-14:GR", "\033%/1" }, 122 { "ISO8859-15:GR", "\033%/1" }, 123#endif 124 /* For use by utf8 -> ctext */ 125 { "BIG5-0:GLGR", "\033%/2"}, 126 { "BIG5HKSCS-0:GLGR", "\033%/2"}, 127 { "GBK-0:GLGR", "\033%/2"}, 128 /* used by Emacs, but not backed by ISO-IR */ 129 { "BIG5-E0:GL", "\033$(0" }, 130 { "BIG5-E0:GR", "\033$)0" }, 131 { "BIG5-E1:GL", "\033$(1" }, 132 { "BIG5-E1:GR", "\033$)1" }, 133 134}; 135 136/* We represent UTF-8 as an XlcGLGR charset, not in extended segments. 
*/ 137#define UTF8_IN_EXTSEQ 0 138 139/* ======================= Parsing ESC Sequences ======================= */ 140 141#define XctC0 0x0000 142#define XctHT 0x0009 143#define XctNL 0x000a 144#define XctESC 0x001b 145#define XctGL 0x0020 146#define XctC1 0x0080 147#define XctCSI 0x009b 148#define XctGR 0x00a0 149#define XctSTX 0x0002 150 151#define XctCntrlFunc 0x0023 152#define XctMB 0x0024 153#define XctOtherCoding 0x0025 154#define XctGL94 0x0028 155#define XctGR94 0x0029 156#define XctGR96 0x002d 157#define XctNonStandard 0x002f 158#define XctIgnoreExt 0x0030 159#define XctNotIgnoreExt 0x0031 160#define XctLeftToRight 0x0031 161#define XctRightToLeft 0x0032 162#define XctDirection 0x005d 163#define XctDirectionEnd 0x005d 164 165#define XctGL94MB 0x2428 166#define XctGR94MB 0x2429 167#define XctExtSeg 0x252f 168#define XctReturn 0x2540 169 170/* 171 * Parses the header of a Compound Text segment, i.e. the charset designator. 172 * The string starts at *text and has *length bytes. 173 * Return value is one of: 174 * 0 (no valid charset designator), 175 * XctGL94, XctGR94, XctGR96, XctGL94MB, XctGR94MB, 176 * XctLeftToRight, XctRightToLeft, XctDirectionEnd, 177 * XctExtSeg, XctOtherCoding, XctReturn, XctIgnoreExt, XctNotIgnoreExt. 178 * If the return value is not 0, *text is incremented and *length decremented, 179 * to point past the charset designator. If the return value is one of 180 * XctGL94, XctGR94, XctGR96, XctGL94MB, XctGR94MB, 181 * XctExtSeg, XctOtherCoding, XctIgnoreExt, XctNotIgnoreExt, 182 * *final_byte is set to the "final byte" of the charset designator. 
183 */ 184static unsigned int 185_XlcParseCT( 186 const char **text, 187 int *length, 188 unsigned char *final_byte) 189{ 190 unsigned int ret = 0; 191 unsigned char ch; 192 const unsigned char *str = (const unsigned char *) *text; 193 194 *final_byte = 0; 195 196 if (*length < 1) 197 return 0; 198 switch (ch = *str++) { 199 case XctESC: 200 if (*length < 2) 201 return 0; 202 switch (ch = *str++) { 203 case XctOtherCoding: /* % */ 204 if (*length < 3) 205 return 0; 206 ch = *str++; 207 if (ch == XctNonStandard) { /* / */ 208 if (*length < 4) 209 return 0; 210 ret = XctExtSeg; 211 ch = *str++; 212 } else if (ch == '@') { 213 ret = XctReturn; 214 } else { 215 ret = XctOtherCoding; 216 } 217 *final_byte = ch; 218 break; 219 220 case XctCntrlFunc: /* # */ 221 if (*length < 4) 222 return 0; 223 *final_byte = *str++; 224 switch (*str++) { 225 case XctIgnoreExt: /* 0 */ 226 ret = XctIgnoreExt; 227 break; 228 case XctNotIgnoreExt: /* 1 */ 229 ret = XctNotIgnoreExt; 230 break; 231 default: 232 ret = 0; 233 break; 234 } 235 break; 236 237 case XctMB: /* $ */ 238 if (*length < 4) 239 return 0; 240 ch = *str++; 241 switch (ch) { 242 case XctGL94: /* ( */ 243 ret = XctGL94MB; 244 break; 245 case XctGR94: /* ) */ 246 ret = XctGR94MB; 247 break; 248 default: 249 ret = 0; 250 break; 251 } 252 *final_byte = *str++; 253 break; 254 255 case XctGL94: /* ( */ 256 if (*length < 3) 257 return 0; 258 ret = XctGL94; 259 *final_byte = *str++; 260 break; 261 case XctGR94: /* ) */ 262 if (*length < 3) 263 return 0; 264 ret = XctGR94; 265 *final_byte = *str++; 266 break; 267 case XctGR96: /* - */ 268 if (*length < 3) 269 return 0; 270 ret = XctGR96; 271 *final_byte = *str++; 272 break; 273 } 274 break; 275 case XctCSI: 276 /* direction */ 277 if (*length < 2) 278 return 0; 279 switch (*str++) { 280 case XctLeftToRight: 281 if (*length < 3) 282 return 0; 283 if (*str++ == XctDirection) 284 ret = XctLeftToRight; 285 break; 286 case XctRightToLeft: 287 if (*length < 3) 288 return 0; 289 if 
(*str++ == XctDirection) 290 ret = XctRightToLeft; 291 break; 292 case XctDirectionEnd: 293 ret = XctDirectionEnd; 294 break; 295 } 296 break; 297 } 298 299 if (ret) { 300 *length -= (const char *) str - *text; 301 *text = (const char *) str; 302 } 303 return ret; 304} 305 306/* 307 * Fills into a freshly created XlcCharSet the fields that can be inferred 308 * from the ESC sequence. These are side, char_size, set_size. 309 * Returns True if the charset can be used with Compound Text. 310 * 311 * Used by _XlcCreateDefaultCharSet. 312 */ 313Bool 314_XlcParseCharSet( 315 XlcCharSet charset) 316{ 317 unsigned int type; 318 unsigned char final_byte; 319 const char *ptr = charset->ct_sequence; 320 int length; 321 int char_size; 322 323 if (*ptr == '\0') 324 return False; 325 326 length = (int) strlen(ptr); 327 328 type = _XlcParseCT(&ptr, &length, &final_byte); 329 330 /* Check for validity and determine char_size. 331 char_size = 0 means varying number of bytes per character. */ 332 switch (type) { 333 case XctGL94: 334 case XctGR94: 335 case XctGR96: 336 char_size = 1; 337 break; 338 case XctGL94MB: 339 case XctGR94MB: 340 char_size = (final_byte < 0x60 ? 2 : final_byte < 0x70 ? 3 : 4); 341 break; 342 case XctExtSeg: 343 char_size = final_byte - '0'; 344 if (!(char_size >= 0 && char_size <= 4)) 345 return False; 346 break; 347 case XctOtherCoding: 348 char_size = 0; 349 break; 350 default: 351 return False; 352 } 353 354 charset->char_size = char_size; 355 356 /* Fill in other values. 
*/ 357 switch (type) { 358 case XctGL94: 359 case XctGL94MB: 360 charset->side = XlcGL; 361 charset->set_size = 94; 362 break; 363 case XctGR94: 364 case XctGR94MB: 365 charset->side = XlcGR; 366 charset->set_size = 94; 367 break; 368 case XctGR96: 369 charset->side = XlcGR; 370 charset->set_size = 96; 371 break; 372 case XctExtSeg: 373 case XctOtherCoding: 374 charset->side = XlcGLGR; 375 charset->set_size = 0; 376 break; 377 } 378 return True; 379} 380 381 382/* =============== Management of the List of Character Sets =============== */ 383 384/* 385 * Representation of a character set that can be used for Compound Text, 386 * at run time. 387 * Note: This information is not contained in the XlcCharSet, because 388 * multiple ESC sequences may be used for the same XlcCharSet. 389 */ 390typedef struct _CTInfoRec { 391 XlcCharSet charset; 392 const char *ct_sequence; /* Compound Text ESC sequence */ 393 unsigned int type; 394 unsigned char final_byte; 395 /* If type == XctExtSeg: */ 396 const char *ext_segment; /* extended segment name, then '\002' */ 397 int ext_segment_len; /* length of above, including final '\002' */ 398 399 struct _CTInfoRec *next; 400} CTInfoRec, *CTInfo; 401 402/* 403 * List of character sets that can be used for Compound Text, 404 * Includes all that are listed in default_ct_data, but more can be added 405 * at runtime through _XlcAddCT. 406 */ 407static CTInfo ct_list = NULL; 408static CTInfo ct_list_end = NULL; 409 410/* 411 * Returns a Compound Text info record for an ESC sequence. 412 * The first part of the ESC sequence has already been parsed into 'type' 413 * and 'final_byte'. The remainder starts at 'text', at least 'text_len' 414 * bytes (only used if type == XctExtSeg). 
415 */ 416static CTInfo 417_XlcGetCTInfo( 418 unsigned int type, 419 unsigned char final_byte, 420 const char *text, 421 int text_len) 422{ 423 CTInfo ct_info; 424 425 for (ct_info = ct_list; ct_info; ct_info = ct_info->next) 426 if (ct_info->type == type 427 && ct_info->final_byte == final_byte 428 && (type != XctExtSeg 429 || (text_len >= ct_info->ext_segment_len 430 && memcmp(text, ct_info->ext_segment, 431 (size_t) ct_info->ext_segment_len) == 0))) 432 return ct_info; 433 434 return (CTInfo) NULL; 435} 436 437/* Returns the Compound Text info for a given XlcCharSet. 438 Returns NULL if none is found. */ 439static CTInfo 440_XlcGetCTInfoFromCharSet( 441 XlcCharSet charset) 442{ 443 CTInfo ct_info; 444 445 for (ct_info = ct_list; ct_info; ct_info = ct_info->next) 446 if (ct_info->charset == charset) 447 return ct_info; 448 449 return (CTInfo) NULL; 450} 451 452/* Creates a new XlcCharSet, given its name (including side suffix) and 453 Compound Text ESC sequence (normally at most 4 bytes), and makes it 454 eligible for Compound Text processing. */ 455XlcCharSet 456_XlcAddCT( 457 const char *name, 458 const char *ct_sequence) 459{ 460 CTInfo ct_info, existing_info; 461 XlcCharSet charset; 462 const char *ct_ptr; 463 int length; 464 unsigned int type; 465 unsigned char final_byte; 466 467 charset = _XlcGetCharSet(name); 468 if (charset != NULL) { 469 /* Even if the charset already exists, it is OK to register a second 470 Compound Text sequence for it. */ 471 } else { 472 /* Attempt to create the charset. */ 473 charset = _XlcCreateDefaultCharSet(name, ct_sequence); 474 if (charset == NULL) 475 return (XlcCharSet) NULL; 476 _XlcAddCharSet(charset); 477 } 478 479 /* Allocate a CTinfo record. 
*/ 480 length = (int) strlen(ct_sequence); 481 ct_info = Xmalloc(sizeof(CTInfoRec) + length+1); 482 if (ct_info == NULL) 483 return charset; 484 485 ct_info->charset = charset; 486 ct_info->ct_sequence = strcpy((char *) (ct_info + 1), ct_sequence); 487 488 /* Parse the Compound Text sequence. */ 489 ct_ptr = ct_sequence; 490 type = _XlcParseCT(&ct_ptr, &length, &final_byte); 491 492 ct_info->type = type; 493 ct_info->final_byte = final_byte; 494 495 switch (type) { 496 case XctGL94: 497 case XctGR94: 498 case XctGR96: 499 case XctGL94MB: 500 case XctGR94MB: 501 case XctOtherCoding: 502 ct_info->ext_segment = NULL; 503 ct_info->ext_segment_len = 0; 504 break; 505 case XctExtSeg: { 506 /* By convention, the extended segment name is the encoding_name 507 in lowercase. */ 508 const char *q = charset->encoding_name; 509 int n = (int) strlen(q); 510 char *p; 511 512 /* Ensure ct_info->ext_segment_len <= 0x3fff - 6. */ 513 if (n > 0x3fff - 6 - 1) { 514 Xfree(ct_info); 515 return charset; 516 } 517 p = Xmalloc(n+1); 518 if (p == NULL) { 519 Xfree(ct_info); 520 return charset; 521 } 522 ct_info->ext_segment = p; 523 ct_info->ext_segment_len = n+1; 524 for ( ; n > 0; p++, q++, n--) 525 *p = (*q >= 'A' && *q <= 'Z' ? *q - 'A' + 'a' : *q); 526 *p = XctSTX; 527 break; 528 } 529 default: 530 Xfree(ct_info); 531 return (XlcCharSet) NULL; 532 } 533 534 /* Insert it into the list, if not already present. */ 535 existing_info = 536 _XlcGetCTInfo(type, ct_info->final_byte, 537 ct_info->ext_segment, ct_info->ext_segment_len); 538 if (existing_info == NULL) { 539 /* Insert it at the end. If there are duplicates CTinfo entries 540 for the same XlcCharSet, we want the first (standard) one to 541 override the second (user defined) one. 
*/ 542 ct_info->next = NULL; 543 if (ct_list_end) 544 ct_list_end->next = ct_info; 545 else 546 ct_list = ct_info; 547 ct_list_end = ct_info; 548 } else { 549 if (existing_info->charset != charset 550 /* We have a conflict, with one exception: JISX0208.1983-0 and 551 JISX0208.1990-0 are the same for all practical purposes. */ 552 && !(strncmp(existing_info->charset->name, "JISX0208", 8) == 0 553 && strncmp(charset->name, "JISX0208", 8) == 0)) { 554 fprintf(stderr, 555 "Xlib: charsets %s and %s have the same CT sequence\n", 556 charset->name, existing_info->charset->name); 557 if (strcmp(charset->ct_sequence, ct_sequence) == 0) 558 charset->ct_sequence = ""; 559 } 560 Xfree(ct_info); 561 } 562 563 return charset; 564} 565 566 567/* ========== Converters String <--> CharSet <--> Compound Text ========== */ 568 569/* 570 * Structure representing the parse state of a Compound Text string. 571 */ 572typedef struct _StateRec { 573 XlcCharSet charset; /* The charset of the current segment */ 574 XlcCharSet GL_charset; /* The charset responsible for 0x00..0x7F */ 575 XlcCharSet GR_charset; /* The charset responsible for 0x80..0xFF */ 576 XlcCharSet Other_charset; /* != NULL if currently in an other segment */ 577 int ext_seg_left; /* > 0 if currently in an extended segment */ 578} StateRec, *State; 579 580 581/* Subroutine for parsing an ESC sequence. */ 582 583typedef enum { 584 resOK, /* Charset saved in 'state', sequence skipped */ 585 resNotInList, /* Charset not found, sequence skipped */ 586 resNotCTSeq /* EscSeq not recognized, pointers not changed */ 587} CheckResult; 588 589static CheckResult 590_XlcCheckCTSequence( 591 State state, 592 const char **ctext, 593 int *ctext_len) 594{ 595 XlcCharSet charset; 596 CTInfo ct_info; 597 const char *tmp_ctext = *ctext; 598 int tmp_ctext_len = *ctext_len; 599 unsigned int type; 600 unsigned char final_byte; 601 int ext_seg_left = 0; 602 603 /* Check for validity. 
*/ 604 type = _XlcParseCT(&tmp_ctext, &tmp_ctext_len, &final_byte); 605 606 switch (type) { 607 case XctGL94: 608 case XctGR94: 609 case XctGR96: 610 case XctGL94MB: 611 case XctGR94MB: 612 case XctOtherCoding: 613 *ctext = tmp_ctext; 614 *ctext_len = tmp_ctext_len; 615 break; 616 case XctReturn: 617 *ctext = tmp_ctext; 618 *ctext_len = tmp_ctext_len; 619 state->Other_charset = NULL; 620 return resOK; 621 case XctExtSeg: 622 if (tmp_ctext_len > 2 623 && (tmp_ctext[0] & 0x80) && (tmp_ctext[0] & 0x80)) { 624 unsigned int msb = tmp_ctext[0] & 0x7f; 625 unsigned int lsb = tmp_ctext[1] & 0x7f; 626 ext_seg_left = (msb << 7) + lsb; 627 if (ext_seg_left <= tmp_ctext_len - 2) { 628 *ctext = tmp_ctext + 2; 629 *ctext_len = tmp_ctext_len - 2; 630 break; 631 } 632 } 633 return resNotCTSeq; 634 default: 635 return resNotCTSeq; 636 } 637 638 ct_info = _XlcGetCTInfo(type, final_byte, *ctext, ext_seg_left); 639 640 if (ct_info) { 641 charset = ct_info->charset; 642 state->ext_seg_left = ext_seg_left; 643 if (type == XctExtSeg) { 644 state->charset = charset; 645 /* Skip past the extended segment name and the separator. */ 646 *ctext += ct_info->ext_segment_len; 647 *ctext_len -= ct_info->ext_segment_len; 648 state->ext_seg_left -= ct_info->ext_segment_len; 649 } else if (type == XctOtherCoding) { 650 state->Other_charset = charset; 651 } else { 652 if (charset->side == XlcGL) { 653 state->GL_charset = charset; 654 } else if (charset->side == XlcGR) { 655 state->GR_charset = charset; 656 } else { 657 state->GL_charset = charset; 658 state->GR_charset = charset; 659 } 660 } 661 return resOK; 662 } else { 663 state->ext_seg_left = 0; 664 if (type == XctExtSeg) { 665 /* Skip the entire extended segment. 
*/ 666 *ctext += ext_seg_left; 667 *ctext_len -= ext_seg_left; 668 } 669 return resNotInList; 670 } 671} 672 673static void 674init_state( 675 XlcConv conv) 676{ 677 State state = (State) conv->state; 678 static XlcCharSet default_GL_charset = NULL; 679 static XlcCharSet default_GR_charset = NULL; 680 681 if (default_GL_charset == NULL) { 682 default_GL_charset = _XlcGetCharSet("ISO8859-1:GL"); 683 default_GR_charset = _XlcGetCharSet("ISO8859-1:GR"); 684 } 685 686 /* The initial state is ISO-8859-1 on both sides. */ 687 state->GL_charset = state->charset = default_GL_charset; 688 state->GR_charset = default_GR_charset; 689 690 state->Other_charset = NULL; 691 692 state->ext_seg_left = 0; 693} 694 695/* from XlcNCompoundText to XlcNCharSet */ 696 697static int 698cttocs( 699 XlcConv conv, 700 XPointer *from, 701 int *from_left, 702 XPointer *to, 703 int *to_left, 704 XPointer *args, 705 int num_args) 706{ 707 State state = (State) conv->state; 708 XlcCharSet charset = NULL; 709 const char *ctptr; 710 char *bufptr; 711 int ctext_len, buf_len; 712 int unconv_num = 0; 713 714 ctptr = (const char *) *from; 715 bufptr = (char *) *to; 716 ctext_len = *from_left; 717 buf_len = *to_left; 718 719 while (ctext_len > 0 && buf_len > 0) { 720 if (state->ext_seg_left == 0) { 721 /* Not in the middle of an extended segment; look at next byte. */ 722 unsigned char ch = *ctptr; 723 XlcCharSet ch_charset; 724 725 if (ch == XctESC) { 726 CheckResult ret = 727 _XlcCheckCTSequence(state, &ctptr, &ctext_len); 728 if (ret == resOK) 729 /* state has been modified. */ 730 continue; 731 if (ret == resNotInList) { 732 /* XXX Just continue with previous charset. */ 733 unconv_num++; 734 continue; 735 } 736 } else if (ch == XctCSI) { 737 /* XXX Simply ignore the XctLeftToRight, XctRightToLeft, 738 XctDirectionEnd sequences for the moment. 
*/ 739 unsigned char dummy; 740 if (_XlcParseCT(&ctptr, &ctext_len, &dummy)) { 741 unconv_num++; 742 continue; 743 } 744 } 745 746 /* Find the charset which is responsible for this byte. */ 747 ch_charset = (state->Other_charset != NULL ? state->Other_charset : 748 (ch & 0x80 ? state->GR_charset : state->GL_charset)); 749 750 /* Set the charset of this run, or continue the current run, 751 or stop the current run. */ 752 if (charset) { 753 if (charset != ch_charset) 754 break; 755 } else { 756 state->charset = charset = ch_charset; 757 } 758 759 /* We don't want to split a character into multiple pieces. */ 760 if (buf_len < 6) { 761 if (charset->char_size > 0) { 762 if (buf_len < charset->char_size) 763 break; 764 } else { 765 /* char_size == 0 is tricky. The code here is good only 766 for valid UTF-8 input. */ 767 if (charset->ct_sequence[0] == XctESC 768 && charset->ct_sequence[1] == XctOtherCoding 769 && charset->ct_sequence[2] == 'G') { 770 int char_size = (ch < 0xc0 ? 1 : 771 ch < 0xe0 ? 2 : 772 ch < 0xf0 ? 3 : 773 ch < 0xf8 ? 4 : 774 ch < 0xfc ? 5 : 775 6); 776 if (buf_len < char_size) 777 break; 778 } 779 } 780 } 781 782 *bufptr++ = *ctptr++; 783 ctext_len--; 784 buf_len--; 785 } else { 786 /* Copy as much as possible from the current extended segment 787 to the buffer. */ 788 int char_size; 789 790 /* Set the charset of this run, or continue the current run, 791 or stop the current run. */ 792 if (charset) { 793 if (charset != state->charset) 794 break; 795 } else { 796 charset = state->charset; 797 } 798 799 char_size = charset->char_size; 800 801 if (state->ext_seg_left <= buf_len || char_size > 0) { 802 int n = (state->ext_seg_left <= buf_len 803 ? state->ext_seg_left 804 : (buf_len / char_size) * char_size); 805 memcpy(bufptr, ctptr, (size_t) n); 806 ctptr += n; ctext_len -= n; 807 bufptr += n; buf_len -= n; 808 state->ext_seg_left -= n; 809 } else { 810#if UTF8_IN_EXTSEQ 811 /* char_size == 0 is tricky. 
The code here is good only 812 for valid UTF-8 input. */ 813 if (strcmp(charset->name, "ISO10646-1") == 0) { 814 unsigned char ch = *ctptr; 815 int char_size = (ch < 0xc0 ? 1 : 816 ch < 0xe0 ? 2 : 817 ch < 0xf0 ? 3 : 818 ch < 0xf8 ? 4 : 819 ch < 0xfc ? 5 : 820 6); 821 int i; 822 if (buf_len < char_size) 823 break; 824 /* A small loop is faster than calling memcpy. */ 825 for (i = char_size; i > 0; i--) 826 *bufptr++ = *ctptr++; 827 ctext_len -= char_size; 828 buf_len -= char_size; 829 state->ext_seg_left -= char_size; 830 } else 831#endif 832 { 833 /* Here ctext_len >= state->ext_seg_left > buf_len. 834 We may be splitting a character into multiple pieces. 835 Oh well. */ 836 int n = buf_len; 837 memcpy(bufptr, ctptr, (size_t) n); 838 ctptr += n; ctext_len -= n; 839 bufptr += n; buf_len -= n; 840 state->ext_seg_left -= n; 841 } 842 } 843 } 844 } 845 846 /* 'charset' is the charset for the current run. In some cases, 847 'state->charset' contains the charset for the next run. Therefore, 848 return 'charset'. 849 'charset' may still be NULL only if no output was produced. */ 850 if (num_args > 0) 851 *((XlcCharSet *) args[0]) = charset; 852 853 *from_left -= ctptr - *((const char **) from); 854 *from = (XPointer) ctptr; 855 856 *to_left -= bufptr - *((char **) to); 857 *to = (XPointer) bufptr; 858 859 return unconv_num; 860} 861 862/* from XlcNCharSet to XlcNCompoundText */ 863 864static int 865cstoct( 866 XlcConv conv, 867 XPointer *from, 868 int *from_left, 869 XPointer *to, 870 int *to_left, 871 XPointer *args, 872 int num_args) 873{ 874 State state = (State) conv->state; 875 XlcSide side; 876 unsigned char min_ch = 0, max_ch = 0; 877 int length, unconv_num; 878 CTInfo ct_info; 879 XlcCharSet charset; 880 const char *csptr; 881 char *ctptr; 882 int csstr_len, ct_len; 883 char *ext_segment_start; 884 int char_size; 885 886 /* One argument is required, of type XlcCharSet. 
*/ 887 if (num_args < 1) 888 return -1; 889 890 csptr = *((const char **) from); 891 ctptr = *((char **) to); 892 csstr_len = *from_left; 893 ct_len = *to_left; 894 895 charset = (XlcCharSet) args[0]; 896 897 ct_info = _XlcGetCTInfoFromCharSet(charset); 898 if (ct_info == NULL) 899 return -1; 900 901 side = charset->side; 902 length = (int) strlen(ct_info->ct_sequence); 903 904 ext_segment_start = NULL; 905 906 if (ct_info->type == XctOtherCoding) { 907 /* Output the Escape sequence for switching to the charset, and 908 reserve room now for the XctReturn sequence at the end. */ 909 if (ct_len < length + 3) 910 return -1; 911 912 memcpy(ctptr, ct_info->ct_sequence, (size_t) length); 913 ctptr += length; 914 ct_len -= length + 3; 915 } else 916 /* Test whether the charset is already active. */ 917 if (((side == XlcGR || side == XlcGLGR) 918 && charset != state->GR_charset) 919 || ((side == XlcGL || side == XlcGLGR) 920 && charset != state->GL_charset)) { 921 922 /* Output the Escape sequence for switching to the charset. */ 923 if (ct_info->type == XctExtSeg) { 924 if (ct_len < length + 2 + ct_info->ext_segment_len) 925 return -1; 926 927 memcpy(ctptr, ct_info->ct_sequence, (size_t) length); 928 ctptr += length; 929 ct_len -= length; 930 931 ctptr += 2; 932 ct_len -= 2; 933 ext_segment_start = ctptr; 934 935 /* The size of an extended segment must fit in 14 bits. */ 936 if (ct_len > 0x3fff) 937 ct_len = 0x3fff; 938 939 memcpy(ctptr, ct_info->ext_segment, (size_t) ct_info->ext_segment_len); 940 ctptr += ct_info->ext_segment_len; 941 ct_len -= ct_info->ext_segment_len; 942 } else { 943 if (ct_len < length) 944 return -1; 945 946 memcpy(ctptr, ct_info->ct_sequence, (size_t) length); 947 ctptr += length; 948 ct_len -= length; 949 } 950 } 951 952 /* If the charset has side GL or GR, prepare remapping the characters 953 to the correct side. 
*/ 954 if (charset->set_size) { 955 min_ch = 0x20; 956 max_ch = 0x7f; 957 if (charset->set_size == 94) { 958 max_ch--; 959 if (charset->char_size > 1 || side == XlcGR) 960 min_ch++; 961 } 962 } 963 964 /* Actually copy the contents. */ 965 unconv_num = 0; 966 char_size = charset->char_size; 967 if (char_size == 1) { 968 while (csstr_len > 0 && ct_len > 0) { 969 if (charset->set_size) { 970 /* The CompoundText specification says that the only 971 control characters allowed are 0x09, 0x0a, 0x1b, 0x9b. 972 Therefore here we eliminate other control characters. */ 973 unsigned char ch = *((const unsigned char *) csptr) & 0x7f; 974 if (!((ch >= min_ch && ch <= max_ch) 975 || (side == XlcGL 976 && (ch == 0x00 || ch == 0x09 || ch == 0x0a)) 977 || ((side == XlcGL || side == XlcGR) 978 && (ch == 0x1b)))) { 979 csptr++; 980 csstr_len--; 981 unconv_num++; 982 continue; 983 } 984 } 985 986 if (side == XlcGL) 987 *ctptr++ = *csptr++ & 0x7f; 988 else if (side == XlcGR) 989 *ctptr++ = *csptr++ | 0x80; 990 else 991 *ctptr++ = *csptr++; 992 csstr_len--; 993 ct_len--; 994 } 995 } else if (char_size > 1) { 996 while (csstr_len >= char_size && ct_len >= char_size) { 997 if (side == XlcGL) { 998 int i; 999 for (i = char_size; i > 0; i--) 1000 *ctptr++ = *csptr++ & 0x7f; 1001 } else if (side == XlcGR) { 1002 int i; 1003 for (i = char_size; i > 0; i--) 1004 *ctptr++ = *csptr++ | 0x80; 1005 } else { 1006 int i; 1007 for (i = char_size; i > 0; i--) 1008 *ctptr++ = *csptr++; 1009 } 1010 csstr_len -= char_size; 1011 ct_len -= char_size; 1012 } 1013 } else { 1014 /* char_size = 0. The code here is good only for valid UTF-8 input. */ 1015 if ((charset->ct_sequence[0] == XctESC 1016 && charset->ct_sequence[1] == XctOtherCoding 1017 && charset->ct_sequence[2] == 'G') 1018#if UTF8_IN_EXTSEQ 1019 || strcmp(charset->name, "ISO10646-1") == 0 1020#endif 1021 ) { 1022 while (csstr_len > 0 && ct_len > 0) { 1023 unsigned char ch = * (const unsigned char *) csptr; 1024 int ch_size = (ch < 0xc0 ? 
1 : 1025 ch < 0xe0 ? 2 : 1026 ch < 0xf0 ? 3 : 1027 ch < 0xf8 ? 4 : 1028 ch < 0xfc ? 5 : 1029 6); 1030 int i; 1031 if (!(csstr_len >= ch_size && ct_len >= ch_size)) 1032 break; 1033 for (i = ch_size; i > 0; i--) 1034 *ctptr++ = *csptr++; 1035 csstr_len -= ch_size; 1036 ct_len -= ch_size; 1037 } 1038 } else { 1039 while (csstr_len > 0 && ct_len > 0) { 1040 *ctptr++ = *csptr++; 1041 csstr_len--; 1042 ct_len--; 1043 } 1044 } 1045 } 1046 1047 if (ct_info->type == XctOtherCoding) { 1048 /* Terminate with an XctReturn sequence. */ 1049 ctptr[0] = XctESC; 1050 ctptr[1] = XctOtherCoding; 1051 ctptr[2] = '@'; 1052 ctptr += 3; 1053 } else if (ext_segment_start != NULL) { 1054 /* Backpatch the extended segment's length. */ 1055 int ext_segment_length = ctptr - ext_segment_start; 1056 *(ext_segment_start - 2) = (ext_segment_length >> 7) | 0x80; 1057 *(ext_segment_start - 1) = (ext_segment_length & 0x7f) | 0x80; 1058 } else { 1059 if (side == XlcGR || side == XlcGLGR) 1060 state->GR_charset = charset; 1061 if (side == XlcGL || side == XlcGLGR) 1062 state->GL_charset = charset; 1063 } 1064 1065 *from_left -= csptr - *((const char **) from); 1066 *from = (XPointer) csptr; 1067 1068 *to_left -= ctptr - *((char **) to); 1069 *to = (XPointer) ctptr; 1070 1071 return 0; 1072} 1073 1074/* from XlcNString to XlcNCharSet */ 1075 1076static int 1077strtocs( 1078 XlcConv conv, 1079 XPointer *from, 1080 int *from_left, 1081 XPointer *to, 1082 int *to_left, 1083 XPointer *args, 1084 int num_args) 1085{ 1086 State state = (State) conv->state; 1087 const char *src; 1088 char *dst; 1089 unsigned char side; 1090 int length; 1091 1092 src = (const char *) *from; 1093 dst = (char *) *to; 1094 1095 length = min(*from_left, *to_left); 1096 side = *((const unsigned char *) src) & 0x80; 1097 1098 while (side == (*((const unsigned char *) src) & 0x80) && length-- > 0) 1099 *dst++ = *src++; 1100 1101 *from_left -= src - (const char *) *from; 1102 *from = (XPointer) src; 1103 *to_left -= dst - (char *) 
*to; 1104 *to = (XPointer) dst; 1105 1106 if (num_args > 0) 1107 *((XlcCharSet *)args[0]) = (side ? state->GR_charset : state->GL_charset); 1108 1109 return 0; 1110} 1111 1112/* from XlcNCharSet to XlcNString */ 1113 1114static int 1115cstostr( 1116 XlcConv conv, 1117 XPointer *from, 1118 int *from_left, 1119 XPointer *to, 1120 int *to_left, 1121 XPointer *args, 1122 int num_args) 1123{ 1124 State state = (State) conv->state; 1125 const char *csptr; 1126 char *string_ptr; 1127 int csstr_len, str_len; 1128 unsigned char ch; 1129 int unconv_num = 0; 1130 1131 /* This converter can only convert from ISO8859-1:GL and ISO8859-1:GR. */ 1132 if (num_args < 1 1133 || !((XlcCharSet) args[0] == state->GL_charset 1134 || (XlcCharSet) args[0] == state->GR_charset)) 1135 return -1; 1136 1137 csptr = *((const char **) from); 1138 string_ptr = *((char **) to); 1139 csstr_len = *from_left; 1140 str_len = *to_left; 1141 1142 while (csstr_len > 0 && str_len > 0) { 1143 ch = *((const unsigned char *) csptr++); 1144 csstr_len--; 1145 /* Citing ICCCM: "STRING as a type specifies the ISO Latin-1 character 1146 set plus the control characters TAB and NEWLINE." 
*/ 1147 if ((ch < 0x20 && ch != 0x00 && ch != 0x09 && ch != 0x0a) 1148 || (ch >= 0x7f && ch < 0xa0)) { 1149 unconv_num++; 1150 continue; 1151 } 1152 *((unsigned char *) string_ptr++) = ch; 1153 str_len--; 1154 } 1155 1156 *from_left -= csptr - *((const char **) from); 1157 *from = (XPointer) csptr; 1158 1159 *to_left -= string_ptr - *((char **) to); 1160 *to = (XPointer) string_ptr; 1161 1162 return unconv_num; 1163} 1164 1165 1166static XlcConv 1167create_conv( 1168 XlcConvMethods methods) 1169{ 1170 XlcConv conv; 1171 1172 conv = Xmalloc(sizeof(XlcConvRec) + sizeof(StateRec)); 1173 if (conv == NULL) 1174 return (XlcConv) NULL; 1175 1176 conv->state = (XPointer) &conv[1]; 1177 1178 conv->methods = methods; 1179 1180 init_state(conv); 1181 1182 return conv; 1183} 1184 1185static void 1186close_converter( 1187 XlcConv conv) 1188{ 1189 /* conv->state is allocated together with conv, free both at once. */ 1190 Xfree(conv); 1191} 1192 1193 1194static XlcConvMethodsRec cttocs_methods = { 1195 close_converter, 1196 cttocs, 1197 init_state 1198}; 1199 1200static XlcConv 1201open_cttocs( 1202 XLCd from_lcd, 1203 const char *from_type, 1204 XLCd to_lcd, 1205 const char *to_type) 1206{ 1207 return create_conv(&cttocs_methods); 1208} 1209 1210 1211static XlcConvMethodsRec cstoct_methods = { 1212 close_converter, 1213 cstoct, 1214 init_state 1215}; 1216 1217static XlcConv 1218open_cstoct( 1219 XLCd from_lcd, 1220 const char *from_type, 1221 XLCd to_lcd, 1222 const char *to_type) 1223{ 1224 return create_conv(&cstoct_methods); 1225} 1226 1227 1228static XlcConvMethodsRec strtocs_methods = { 1229 close_converter, 1230 strtocs, 1231 init_state 1232}; 1233 1234static XlcConv 1235open_strtocs( 1236 XLCd from_lcd, 1237 const char *from_type, 1238 XLCd to_lcd, 1239 const char *to_type) 1240{ 1241 return create_conv(&strtocs_methods); 1242} 1243 1244 1245static XlcConvMethodsRec cstostr_methods = { 1246 close_converter, 1247 cstostr, 1248 init_state 1249}; 1250 1251static XlcConv 
1252open_cstostr( 1253 XLCd from_lcd, 1254 const char *from_type, 1255 XLCd to_lcd, 1256 const char *to_type) 1257{ 1258 return create_conv(&cstostr_methods); 1259} 1260 1261 1262/* =========================== Initialization =========================== */ 1263 1264Bool 1265_XlcInitCTInfo(void) 1266{ 1267 if (ct_list == NULL) { 1268 const CTDataRec *ct_data; 1269 int num; 1270 XlcCharSet charset; 1271 1272 /* Initialize ct_list. */ 1273 1274 num = sizeof(default_ct_data) / sizeof(CTDataRec); 1275 for (ct_data = default_ct_data; num > 0; ct_data++, num--) { 1276 charset = _XlcAddCT(ct_data->name, ct_data->ct_sequence); 1277 if (charset == NULL) 1278 continue; 1279 if (strncmp(charset->ct_sequence, "\x1b\x25\x2f", 3) != 0) 1280 charset->source = CSsrcStd; 1281 else 1282 charset->source = CSsrcXLC; 1283 } 1284 1285 /* Register CompoundText and CharSet converters. */ 1286 1287 _XlcSetConverter((XLCd) NULL, XlcNCompoundText, 1288 (XLCd) NULL, XlcNCharSet, 1289 open_cttocs); 1290 _XlcSetConverter((XLCd) NULL, XlcNString, 1291 (XLCd) NULL, XlcNCharSet, 1292 open_strtocs); 1293 1294 _XlcSetConverter((XLCd) NULL, XlcNCharSet, 1295 (XLCd) NULL, XlcNCompoundText, 1296 open_cstoct); 1297 _XlcSetConverter((XLCd) NULL, XlcNCharSet, 1298 (XLCd) NULL, XlcNString, 1299 open_cstostr); 1300 } 1301 1302 return True; 1303} 1304