/* lcCT.c revision 61b2299d */
1/* $Xorg: lcCT.c,v 1.4 2000/08/17 19:45:16 cpqbld Exp $ */ 2/* 3 * Copyright 1992, 1993 by TOSHIBA Corp. 4 * 5 * Permission to use, copy, modify, and distribute this software and its 6 * documentation for any purpose and without fee is hereby granted, provided 7 * that the above copyright notice appear in all copies and that both that 8 * copyright notice and this permission notice appear in supporting 9 * documentation, and that the name of TOSHIBA not be used in advertising 10 * or publicity pertaining to distribution of the software without specific, 11 * written prior permission. TOSHIBA make no representations about the 12 * suitability of this software for any purpose. It is provided "as is" 13 * without express or implied warranty. 14 * 15 * TOSHIBA DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING 16 * ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL 17 * TOSHIBA BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR 18 * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 19 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 20 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS 21 * SOFTWARE. 22 * 23 * Author: Katsuhisa Yano TOSHIBA Corp. 24 * mopi@osa.ilab.toshiba.co.jp 25 */ 26/* 27 * Copyright 1995 by FUJITSU LIMITED 28 * This is source code modified by FUJITSU LIMITED under the Joint 29 * Development Agreement for the CDE/Motif PST. 
30 * 31 * Modifier: Takanori Tateno FUJITSU LIMITED 32 * 33 */ 34/* 35 * 2000 36 * Modifier: Ivan Pascal The XFree86 Project 37 * Modifier: Bruno Haible The XFree86 Project 38 */ 39/* $XFree86: xc/lib/X11/lcCT.c,v 3.26 2001/10/28 03:32:34 tsi Exp $ */ 40 41#ifdef HAVE_CONFIG_H 42#include <config.h> 43#endif 44#include "Xlibint.h" 45#include "XlcPubI.h" 46#include <X11/Xos.h> 47#include <stdio.h> 48 49 50/* ====================== Built-in Character Sets ====================== */ 51 52/* 53 * Static representation of a character set that can be used in Compound Text. 54 */ 55typedef struct _CTDataRec { 56 const char name[19]; 57 const char ct_sequence[5]; /* Compound Text encoding, ESC sequence */ 58} CTDataRec, *CTData; 59 60static const CTDataRec default_ct_data[] = 61{ 62 /* */ 63 /* X11 registry name MIME name ISO-IR ESC sequence */ 64 /* */ 65 66 /* Registered character sets with one byte per character */ 67 { "ISO8859-1:GL", /* US-ASCII 6 */ "\033(B" }, 68 { "ISO8859-1:GR", /* ISO-8859-1 100 */ "\033-A" }, 69 { "ISO8859-2:GR", /* ISO-8859-2 101 */ "\033-B" }, 70 { "ISO8859-3:GR", /* ISO-8859-3 109 */ "\033-C" }, 71 { "ISO8859-4:GR", /* ISO-8859-4 110 */ "\033-D" }, 72 { "ISO8859-5:GR", /* ISO-8859-5 144 */ "\033-L" }, 73 { "ISO8859-6:GR", /* ISO-8859-6 127 */ "\033-G" }, 74 { "ISO8859-7:GR", /* ISO-8859-7 126 */ "\033-F" }, 75 { "ISO8859-8:GR", /* ISO-8859-8 138 */ "\033-H" }, 76 { "ISO8859-9:GR", /* ISO-8859-9 148 */ "\033-M" }, 77 { "ISO8859-10:GR", /* ISO-8859-10 157 */ "\033-V" }, 78 { "ISO8859-11:GR", /* ISO-8859-11 166 */ "\033-T" }, 79 { "ISO8859-13:GR", /* ISO-8859-13 179 */ "\033-Y" }, 80 { "ISO8859-14:GR", /* ISO-8859-14 199 */ "\033-_" }, 81 { "ISO8859-15:GR", /* ISO-8859-15 203 */ "\033-b" }, 82 { "ISO8859-16:GR", /* ISO-8859-16 226 */ "\033-f" }, 83 { "JISX0201.1976-0:GL", /* ISO-646-JP 14 */ "\033(J" }, 84 { "JISX0201.1976-0:GR", "\033)I" }, 85#if 0 86 { "TIS620-0:GR", /* TIS-620 166 */ "\033-T" }, 87#endif 88 89 /* Registered character sets with 
two byte per character */ 90 { "GB2312.1980-0:GL", /* GB_2312-80 58 */ "\033$(A" }, 91 { "GB2312.1980-0:GR", /* GB_2312-80 58 */ "\033$)A" }, 92 { "JISX0208.1983-0:GL", /* JIS_X0208-1983 87 */ "\033$(B" }, 93 { "JISX0208.1983-0:GR", /* JIS_X0208-1983 87 */ "\033$)B" }, 94 { "JISX0208.1990-0:GL", /* JIS_X0208-1990 168 */ "\033$(B" }, 95 { "JISX0208.1990-0:GR", /* JIS_X0208-1990 168 */ "\033$)B" }, 96 { "JISX0212.1990-0:GL", /* JIS_X0212-1990 159 */ "\033$(D" }, 97 { "JISX0212.1990-0:GR", /* JIS_X0212-1990 159 */ "\033$)D" }, 98 { "KSC5601.1987-0:GL", /* KS_C_5601-1987 149 */ "\033$(C" }, 99 { "KSC5601.1987-0:GR", /* KS_C_5601-1987 149 */ "\033$)C" }, 100 { "CNS11643.1986-1:GL", /* CNS 11643-1992 pl.1 171 */ "\033$(G" }, 101 { "CNS11643.1986-1:GR", /* CNS 11643-1992 pl.1 171 */ "\033$)G" }, 102 { "CNS11643.1986-2:GL", /* CNS 11643-1992 pl.2 172 */ "\033$(H" }, 103 { "CNS11643.1986-2:GR", /* CNS 11643-1992 pl.2 172 */ "\033$)H" }, 104 { "CNS11643.1992-3:GL", /* CNS 11643-1992 pl.3 183 */ "\033$(I" }, 105 { "CNS11643.1992-3:GR", /* CNS 11643-1992 pl.3 183 */ "\033$)I" }, 106 { "CNS11643.1992-4:GL", /* CNS 11643-1992 pl.4 184 */ "\033$(J" }, 107 { "CNS11643.1992-4:GR", /* CNS 11643-1992 pl.4 184 */ "\033$)J" }, 108 { "CNS11643.1992-5:GL", /* CNS 11643-1992 pl.5 185 */ "\033$(K" }, 109 { "CNS11643.1992-5:GR", /* CNS 11643-1992 pl.5 185 */ "\033$)K" }, 110 { "CNS11643.1992-6:GL", /* CNS 11643-1992 pl.6 186 */ "\033$(L" }, 111 { "CNS11643.1992-6:GR", /* CNS 11643-1992 pl.6 186 */ "\033$)L" }, 112 { "CNS11643.1992-7:GL", /* CNS 11643-1992 pl.7 187 */ "\033$(M" }, 113 { "CNS11643.1992-7:GR", /* CNS 11643-1992 pl.7 187 */ "\033$)M" }, 114 115 /* Registered encodings with a varying number of bytes per character */ 116 { "ISO10646-1", /* UTF-8 196 */ "\033%G" }, 117 118 /* Encodings without ISO-IR assigned escape sequence must be 119 defined in XLC_LOCALE files, using "\033%/1" or "\033%/2". 
*/ 120 121 /* Backward compatibility with XFree86 3.x */ 122#if 1 123 { "ISO8859-14:GR", "\033%/1" }, 124 { "ISO8859-15:GR", "\033%/1" }, 125#endif 126 /* For use by utf8 -> ctext */ 127 { "BIG5-0:GLGR", "\033%/2"}, 128 { "BIG5HKSCS-0:GLGR", "\033%/2"}, 129 { "GBK-0:GLGR", "\033%/2"}, 130 /* used by Emacs, but not backed by ISO-IR */ 131 { "BIG5-E0:GL", "\033$(0" }, 132 { "BIG5-E0:GR", "\033$)0" }, 133 { "BIG5-E1:GL", "\033$(1" }, 134 { "BIG5-E1:GR", "\033$)1" }, 135 136}; 137 138/* We represent UTF-8 as an XlcGLGR charset, not in extended segments. */ 139#define UTF8_IN_EXTSEQ 0 140 141/* ======================= Parsing ESC Sequences ======================= */ 142 143#define XctC0 0x0000 144#define XctHT 0x0009 145#define XctNL 0x000a 146#define XctESC 0x001b 147#define XctGL 0x0020 148#define XctC1 0x0080 149#define XctCSI 0x009b 150#define XctGR 0x00a0 151#define XctSTX 0x0002 152 153#define XctCntrlFunc 0x0023 154#define XctMB 0x0024 155#define XctOtherCoding 0x0025 156#define XctGL94 0x0028 157#define XctGR94 0x0029 158#define XctGR96 0x002d 159#define XctNonStandard 0x002f 160#define XctIgnoreExt 0x0030 161#define XctNotIgnoreExt 0x0031 162#define XctLeftToRight 0x0031 163#define XctRightToLeft 0x0032 164#define XctDirection 0x005d 165#define XctDirectionEnd 0x005d 166 167#define XctGL94MB 0x2428 168#define XctGR94MB 0x2429 169#define XctExtSeg 0x252f 170#define XctReturn 0x2540 171 172/* 173 * Parses the header of a Compound Text segment, i.e. the charset designator. 174 * The string starts at *text and has *length bytes. 175 * Return value is one of: 176 * 0 (no valid charset designator), 177 * XctGL94, XctGR94, XctGR96, XctGL94MB, XctGR94MB, 178 * XctLeftToRight, XctRightToLeft, XctDirectionEnd, 179 * XctExtSeg, XctOtherCoding, XctReturn, XctIgnoreExt, XctNotIgnoreExt. 180 * If the return value is not 0, *text is incremented and *length decremented, 181 * to point past the charset designator. 
If the return value is one of 182 * XctGL94, XctGR94, XctGR96, XctGL94MB, XctGR94MB, 183 * XctExtSeg, XctOtherCoding, XctIgnoreExt, XctNotIgnoreExt, 184 * *final_byte is set to the "final byte" of the charset designator. 185 */ 186static unsigned int 187_XlcParseCT( 188 const char **text, 189 int *length, 190 unsigned char *final_byte) 191{ 192 unsigned int ret = 0; 193 unsigned char ch; 194 const unsigned char *str = (const unsigned char *) *text; 195 196 *final_byte = 0; 197 198 if (*length < 1) 199 return 0; 200 switch (ch = *str++) { 201 case XctESC: 202 if (*length < 2) 203 return 0; 204 switch (ch = *str++) { 205 case XctOtherCoding: /* % */ 206 if (*length < 3) 207 return 0; 208 ch = *str++; 209 if (ch == XctNonStandard) { /* / */ 210 if (*length < 4) 211 return 0; 212 ret = XctExtSeg; 213 ch = *str++; 214 } else if (ch == '@') { 215 ret = XctReturn; 216 } else { 217 ret = XctOtherCoding; 218 } 219 *final_byte = ch; 220 break; 221 222 case XctCntrlFunc: /* # */ 223 if (*length < 4) 224 return 0; 225 *final_byte = *str++; 226 switch (*str++) { 227 case XctIgnoreExt: /* 0 */ 228 ret = XctIgnoreExt; 229 break; 230 case XctNotIgnoreExt: /* 1 */ 231 ret = XctNotIgnoreExt; 232 break; 233 default: 234 ret = 0; 235 break; 236 } 237 break; 238 239 case XctMB: /* $ */ 240 if (*length < 4) 241 return 0; 242 ch = *str++; 243 switch (ch) { 244 case XctGL94: /* ( */ 245 ret = XctGL94MB; 246 break; 247 case XctGR94: /* ) */ 248 ret = XctGR94MB; 249 break; 250 default: 251 ret = 0; 252 break; 253 } 254 *final_byte = *str++; 255 break; 256 257 case XctGL94: /* ( */ 258 if (*length < 3) 259 return 0; 260 ret = XctGL94; 261 *final_byte = *str++; 262 break; 263 case XctGR94: /* ) */ 264 if (*length < 3) 265 return 0; 266 ret = XctGR94; 267 *final_byte = *str++; 268 break; 269 case XctGR96: /* - */ 270 if (*length < 3) 271 return 0; 272 ret = XctGR96; 273 *final_byte = *str++; 274 break; 275 } 276 break; 277 case XctCSI: 278 /* direction */ 279 if (*length < 2) 280 return 0; 281 
switch (*str++) { 282 case XctLeftToRight: 283 if (*length < 3) 284 return 0; 285 if (*str++ == XctDirection) 286 ret = XctLeftToRight; 287 break; 288 case XctRightToLeft: 289 if (*length < 3) 290 return 0; 291 if (*str++ == XctDirection) 292 ret = XctRightToLeft; 293 break; 294 case XctDirectionEnd: 295 ret = XctDirectionEnd; 296 break; 297 } 298 break; 299 } 300 301 if (ret) { 302 *length -= (const char *) str - *text; 303 *text = (const char *) str; 304 } 305 return ret; 306} 307 308/* 309 * Fills into a freshly created XlcCharSet the fields that can be inferred 310 * from the ESC sequence. These are side, char_size, set_size. 311 * Returns True if the charset can be used with Compound Text. 312 * 313 * Used by _XlcCreateDefaultCharSet. 314 */ 315Bool 316_XlcParseCharSet( 317 XlcCharSet charset) 318{ 319 unsigned int type; 320 unsigned char final_byte; 321 const char *ptr = charset->ct_sequence; 322 int length; 323 int char_size; 324 325 if (*ptr == '\0') 326 return False; 327 328 length = strlen(ptr); 329 330 type = _XlcParseCT(&ptr, &length, &final_byte); 331 332 /* Check for validity and determine char_size. 333 char_size = 0 means varying number of bytes per character. */ 334 switch (type) { 335 case XctGL94: 336 case XctGR94: 337 case XctGR96: 338 char_size = 1; 339 break; 340 case XctGL94MB: 341 case XctGR94MB: 342 char_size = (final_byte < 0x60 ? 2 : final_byte < 0x70 ? 3 : 4); 343 break; 344 case XctExtSeg: 345 char_size = final_byte - '0'; 346 if (!(char_size >= 0 && char_size <= 4)) 347 return False; 348 break; 349 case XctOtherCoding: 350 char_size = 0; 351 break; 352 default: 353 return False; 354 } 355 356 charset->char_size = char_size; 357 358 /* Fill in other values. 
*/ 359 switch (type) { 360 case XctGL94: 361 case XctGL94MB: 362 charset->side = XlcGL; 363 charset->set_size = 94; 364 break; 365 case XctGR94: 366 case XctGR94MB: 367 charset->side = XlcGR; 368 charset->set_size = 94; 369 break; 370 case XctGR96: 371 charset->side = XlcGR; 372 charset->set_size = 96; 373 break; 374 case XctExtSeg: 375 case XctOtherCoding: 376 charset->side = XlcGLGR; 377 charset->set_size = 0; 378 break; 379 } 380 return True; 381} 382 383 384/* =============== Management of the List of Character Sets =============== */ 385 386/* 387 * Representation of a character set that can be used for Compound Text, 388 * at run time. 389 * Note: This information is not contained in the XlcCharSet, because 390 * multiple ESC sequences may be used for the same XlcCharSet. 391 */ 392typedef struct _CTInfoRec { 393 XlcCharSet charset; 394 const char *ct_sequence; /* Compound Text ESC sequence */ 395 unsigned int type; 396 unsigned char final_byte; 397 /* If type == XctExtSeg: */ 398 const char *ext_segment; /* extended segment name, then '\002' */ 399 int ext_segment_len; /* length of above, including final '\002' */ 400 401 struct _CTInfoRec *next; 402} CTInfoRec, *CTInfo; 403 404/* 405 * List of character sets that can be used for Compound Text, 406 * Includes all that are listed in default_ct_data, but more can be added 407 * at runtime through _XlcAddCT. 408 */ 409static CTInfo ct_list = NULL; 410static CTInfo ct_list_end = NULL; 411 412/* 413 * Returns a Compound Text info record for an ESC sequence. 414 * The first part of the ESC sequence has already been parsed into 'type' 415 * and 'final_byte'. The remainder starts at 'text', at least 'text_len' 416 * bytes (only used if type == XctExtSeg). 
417 */ 418static CTInfo 419_XlcGetCTInfo( 420 unsigned int type, 421 unsigned char final_byte, 422 const char *text, 423 int text_len) 424{ 425 CTInfo ct_info; 426 427 for (ct_info = ct_list; ct_info; ct_info = ct_info->next) 428 if (ct_info->type == type 429 && ct_info->final_byte == final_byte 430 && (type != XctExtSeg 431 || (text_len >= ct_info->ext_segment_len 432 && memcmp(text, ct_info->ext_segment, 433 ct_info->ext_segment_len) == 0))) 434 return ct_info; 435 436 return (CTInfo) NULL; 437} 438 439/* Returns the Compound Text info for a given XlcCharSet. 440 Returns NULL if none is found. */ 441static CTInfo 442_XlcGetCTInfoFromCharSet( 443 XlcCharSet charset) 444{ 445 CTInfo ct_info; 446 447 for (ct_info = ct_list; ct_info; ct_info = ct_info->next) 448 if (ct_info->charset == charset) 449 return ct_info; 450 451 return (CTInfo) NULL; 452} 453 454/* Creates a new XlcCharSet, given its name (including side suffix) and 455 Compound Text ESC sequence (normally at most 4 bytes), and makes it 456 eligible for Compound Text processing. */ 457XlcCharSet 458_XlcAddCT( 459 const char *name, 460 const char *ct_sequence) 461{ 462 CTInfo ct_info, existing_info; 463 XlcCharSet charset; 464 const char *ct_ptr; 465 int length; 466 unsigned int type; 467 unsigned char final_byte; 468 469 charset = _XlcGetCharSet(name); 470 if (charset != NULL) { 471 /* Even if the charset already exists, it is OK to register a second 472 Compound Text sequence for it. */ 473 } else { 474 /* Attempt to create the charset. */ 475 charset = _XlcCreateDefaultCharSet(name, ct_sequence); 476 if (charset == NULL) 477 return (XlcCharSet) NULL; 478 _XlcAddCharSet(charset); 479 } 480 481 /* Allocate a CTinfo record. 
*/ 482 length = strlen(ct_sequence); 483 ct_info = (CTInfo) Xmalloc(sizeof(CTInfoRec) + length+1); 484 if (ct_info == NULL) 485 return charset; 486 487 ct_info->charset = charset; 488 ct_info->ct_sequence = strcpy((char *) (ct_info + 1), ct_sequence); 489 490 /* Parse the Compound Text sequence. */ 491 ct_ptr = ct_sequence; 492 type = _XlcParseCT(&ct_ptr, &length, &final_byte); 493 494 ct_info->type = type; 495 ct_info->final_byte = final_byte; 496 497 switch (type) { 498 case XctGL94: 499 case XctGR94: 500 case XctGR96: 501 case XctGL94MB: 502 case XctGR94MB: 503 case XctOtherCoding: 504 ct_info->ext_segment = NULL; 505 ct_info->ext_segment_len = 0; 506 break; 507 case XctExtSeg: { 508 /* By convention, the extended segment name is the encoding_name 509 in lowercase. */ 510 const char *q = charset->encoding_name; 511 int n = strlen(q); 512 char *p; 513 514 /* Ensure ct_info->ext_segment_len <= 0x3fff - 6. */ 515 if (n > 0x3fff - 6 - 1) { 516 Xfree(ct_info); 517 return charset; 518 } 519 p = (char *) Xmalloc(n+1); 520 if (p == NULL) { 521 Xfree(ct_info); 522 return charset; 523 } 524 ct_info->ext_segment = p; 525 ct_info->ext_segment_len = n+1; 526 for ( ; n > 0; p++, q++, n--) 527 *p = (*q >= 'A' && *q <= 'Z' ? *q - 'A' + 'a' : *q); 528 *p = XctSTX; 529 break; 530 } 531 default: 532 Xfree(ct_info); 533 return (XlcCharSet) NULL; 534 } 535 536 /* Insert it into the list, if not already present. */ 537 existing_info = 538 _XlcGetCTInfo(type, ct_info->final_byte, 539 ct_info->ext_segment, ct_info->ext_segment_len); 540 if (existing_info == NULL) { 541 /* Insert it at the end. If there are duplicates CTinfo entries 542 for the same XlcCharSet, we want the first (standard) one to 543 override the second (user defined) one. 
*/ 544 ct_info->next = NULL; 545 if (ct_list_end) 546 ct_list_end->next = ct_info; 547 else 548 ct_list = ct_info; 549 ct_list_end = ct_info; 550 } else { 551 if (existing_info->charset != charset 552 /* We have a conflict, with one exception: JISX0208.1983-0 and 553 JISX0208.1990-0 are the same for all practical purposes. */ 554 && !(strncmp(existing_info->charset->name, "JISX0208", 8) == 0 555 && strncmp(charset->name, "JISX0208", 8) == 0)) { 556 fprintf(stderr, 557 "Xlib: charsets %s and %s have the same CT sequence\n", 558 charset->name, existing_info->charset->name); 559 if (strcmp(charset->ct_sequence, ct_sequence) == 0) 560 charset->ct_sequence = ""; 561 } 562 Xfree(ct_info); 563 } 564 565 return charset; 566} 567 568 569/* ========== Converters String <--> CharSet <--> Compound Text ========== */ 570 571/* 572 * Structure representing the parse state of a Compound Text string. 573 */ 574typedef struct _StateRec { 575 XlcCharSet charset; /* The charset of the current segment */ 576 XlcCharSet GL_charset; /* The charset responsible for 0x00..0x7F */ 577 XlcCharSet GR_charset; /* The charset responsible for 0x80..0xFF */ 578 XlcCharSet Other_charset; /* != NULL if currently in an other segment */ 579 int ext_seg_left; /* > 0 if currently in an extended segment */ 580} StateRec, *State; 581 582 583/* Subroutine for parsing an ESC sequence. */ 584 585typedef enum { 586 resOK, /* Charset saved in 'state', sequence skipped */ 587 resNotInList, /* Charset not found, sequence skipped */ 588 resNotCTSeq /* EscSeq not recognized, pointers not changed */ 589} CheckResult; 590 591static CheckResult 592_XlcCheckCTSequence( 593 State state, 594 const char **ctext, 595 int *ctext_len) 596{ 597 XlcCharSet charset; 598 CTInfo ct_info; 599 const char *tmp_ctext = *ctext; 600 int tmp_ctext_len = *ctext_len; 601 unsigned int type; 602 unsigned char final_byte; 603 int ext_seg_left = 0; 604 605 /* Check for validity. 
*/ 606 type = _XlcParseCT(&tmp_ctext, &tmp_ctext_len, &final_byte); 607 608 switch (type) { 609 case XctGL94: 610 case XctGR94: 611 case XctGR96: 612 case XctGL94MB: 613 case XctGR94MB: 614 case XctOtherCoding: 615 *ctext = tmp_ctext; 616 *ctext_len = tmp_ctext_len; 617 break; 618 case XctReturn: 619 *ctext = tmp_ctext; 620 *ctext_len = tmp_ctext_len; 621 state->Other_charset = NULL; 622 return resOK; 623 case XctExtSeg: 624 if (tmp_ctext_len > 2 625 && (tmp_ctext[0] & 0x80) && (tmp_ctext[0] & 0x80)) { 626 unsigned int msb = tmp_ctext[0] & 0x7f; 627 unsigned int lsb = tmp_ctext[1] & 0x7f; 628 ext_seg_left = (msb << 7) + lsb; 629 if (ext_seg_left <= tmp_ctext_len - 2) { 630 *ctext = tmp_ctext + 2; 631 *ctext_len = tmp_ctext_len - 2; 632 break; 633 } 634 } 635 return resNotCTSeq; 636 default: 637 return resNotCTSeq; 638 } 639 640 ct_info = _XlcGetCTInfo(type, final_byte, *ctext, ext_seg_left); 641 642 if (ct_info) { 643 charset = ct_info->charset; 644 state->ext_seg_left = ext_seg_left; 645 if (type == XctExtSeg) { 646 state->charset = charset; 647 /* Skip past the extended segment name and the separator. */ 648 *ctext += ct_info->ext_segment_len; 649 *ctext_len -= ct_info->ext_segment_len; 650 state->ext_seg_left -= ct_info->ext_segment_len; 651 } else if (type == XctOtherCoding) { 652 state->Other_charset = charset; 653 } else { 654 if (charset->side == XlcGL) { 655 state->GL_charset = charset; 656 } else if (charset->side == XlcGR) { 657 state->GR_charset = charset; 658 } else { 659 state->GL_charset = charset; 660 state->GR_charset = charset; 661 } 662 } 663 return resOK; 664 } else { 665 state->ext_seg_left = 0; 666 if (type == XctExtSeg) { 667 /* Skip the entire extended segment. 
*/ 668 *ctext += ext_seg_left; 669 *ctext_len -= ext_seg_left; 670 } 671 return resNotInList; 672 } 673} 674 675static void 676init_state( 677 XlcConv conv) 678{ 679 State state = (State) conv->state; 680 static XlcCharSet default_GL_charset = NULL; 681 static XlcCharSet default_GR_charset = NULL; 682 683 if (default_GL_charset == NULL) { 684 default_GL_charset = _XlcGetCharSet("ISO8859-1:GL"); 685 default_GR_charset = _XlcGetCharSet("ISO8859-1:GR"); 686 } 687 688 /* The initial state is ISO-8859-1 on both sides. */ 689 state->GL_charset = state->charset = default_GL_charset; 690 state->GR_charset = default_GR_charset; 691 692 state->Other_charset = NULL; 693 694 state->ext_seg_left = 0; 695} 696 697/* from XlcNCompoundText to XlcNCharSet */ 698 699static int 700cttocs( 701 XlcConv conv, 702 XPointer *from, 703 int *from_left, 704 XPointer *to, 705 int *to_left, 706 XPointer *args, 707 int num_args) 708{ 709 State state = (State) conv->state; 710 XlcCharSet charset = NULL; 711 const char *ctptr; 712 char *bufptr; 713 int ctext_len, buf_len; 714 int unconv_num = 0; 715 716 ctptr = (const char *) *from; 717 bufptr = (char *) *to; 718 ctext_len = *from_left; 719 buf_len = *to_left; 720 721 while (ctext_len > 0 && buf_len > 0) { 722 if (state->ext_seg_left == 0) { 723 /* Not in the middle of an extended segment; look at next byte. */ 724 unsigned char ch = *ctptr; 725 XlcCharSet ch_charset; 726 727 if (ch == XctESC) { 728 CheckResult ret = 729 _XlcCheckCTSequence(state, &ctptr, &ctext_len); 730 if (ret == resOK) 731 /* state has been modified. */ 732 continue; 733 if (ret == resNotInList) { 734 /* XXX Just continue with previous charset. */ 735 unconv_num++; 736 continue; 737 } 738 } else if (ch == XctCSI) { 739 /* XXX Simply ignore the XctLeftToRight, XctRightToLeft, 740 XctDirectionEnd sequences for the moment. 
*/ 741 unsigned char dummy; 742 if (_XlcParseCT(&ctptr, &ctext_len, &dummy)) { 743 unconv_num++; 744 continue; 745 } 746 } 747 748 /* Find the charset which is responsible for this byte. */ 749 ch_charset = (state->Other_charset != NULL ? state->Other_charset : 750 (ch & 0x80 ? state->GR_charset : state->GL_charset)); 751 752 /* Set the charset of this run, or continue the current run, 753 or stop the current run. */ 754 if (charset) { 755 if (charset != ch_charset) 756 break; 757 } else { 758 state->charset = charset = ch_charset; 759 } 760 761 /* We don't want to split a character into multiple pieces. */ 762 if (buf_len < 6) { 763 if (charset->char_size > 0) { 764 if (buf_len < charset->char_size) 765 break; 766 } else { 767 /* char_size == 0 is tricky. The code here is good only 768 for valid UTF-8 input. */ 769 if (charset->ct_sequence[0] == XctESC 770 && charset->ct_sequence[1] == XctOtherCoding 771 && charset->ct_sequence[2] == 'G') { 772 int char_size = (ch < 0xc0 ? 1 : 773 ch < 0xe0 ? 2 : 774 ch < 0xf0 ? 3 : 775 ch < 0xf8 ? 4 : 776 ch < 0xfc ? 5 : 777 6); 778 if (buf_len < char_size) 779 break; 780 } 781 } 782 } 783 784 *bufptr++ = *ctptr++; 785 ctext_len--; 786 buf_len--; 787 } else { 788 /* Copy as much as possible from the current extended segment 789 to the buffer. */ 790 int char_size; 791 792 /* Set the charset of this run, or continue the current run, 793 or stop the current run. */ 794 if (charset) { 795 if (charset != state->charset) 796 break; 797 } else { 798 charset = state->charset; 799 } 800 801 char_size = charset->char_size; 802 803 if (state->ext_seg_left <= buf_len || char_size > 0) { 804 int n = (state->ext_seg_left <= buf_len 805 ? state->ext_seg_left 806 : (buf_len / char_size) * char_size); 807 memcpy(bufptr, ctptr, n); 808 ctptr += n; ctext_len -= n; 809 bufptr += n; buf_len -= n; 810 state->ext_seg_left -= n; 811 } else { 812#if UTF8_IN_EXTSEQ 813 /* char_size == 0 is tricky. The code here is good only 814 for valid UTF-8 input. 
*/ 815 if (strcmp(charset->name, "ISO10646-1") == 0) { 816 unsigned char ch = *ctptr; 817 int char_size = (ch < 0xc0 ? 1 : 818 ch < 0xe0 ? 2 : 819 ch < 0xf0 ? 3 : 820 ch < 0xf8 ? 4 : 821 ch < 0xfc ? 5 : 822 6); 823 int i; 824 if (buf_len < char_size) 825 break; 826 /* A small loop is faster than calling memcpy. */ 827 for (i = char_size; i > 0; i--) 828 *bufptr++ = *ctptr++; 829 ctext_len -= char_size; 830 buf_len -= char_size; 831 state->ext_seg_left -= char_size; 832 } else 833#endif 834 { 835 /* Here ctext_len >= state->ext_seg_left > buf_len. 836 We may be splitting a character into multiple pieces. 837 Oh well. */ 838 int n = buf_len; 839 memcpy(bufptr, ctptr, n); 840 ctptr += n; ctext_len -= n; 841 bufptr += n; buf_len -= n; 842 state->ext_seg_left -= n; 843 } 844 } 845 } 846 } 847 848 /* 'charset' is the charset for the current run. In some cases, 849 'state->charset' contains the charset for the next run. Therefore, 850 return 'charset'. 851 'charset' may still be NULL only if no output was produced. */ 852 if (num_args > 0) 853 *((XlcCharSet *) args[0]) = charset; 854 855 *from_left -= ctptr - *((const char **) from); 856 *from = (XPointer) ctptr; 857 858 *to_left -= bufptr - *((char **) to); 859 *to = (XPointer) bufptr; 860 861 return unconv_num; 862} 863 864/* from XlcNCharSet to XlcNCompoundText */ 865 866static int 867cstoct( 868 XlcConv conv, 869 XPointer *from, 870 int *from_left, 871 XPointer *to, 872 int *to_left, 873 XPointer *args, 874 int num_args) 875{ 876 State state = (State) conv->state; 877 XlcSide side; 878 unsigned char min_ch = 0, max_ch = 0; 879 int length, unconv_num; 880 CTInfo ct_info; 881 XlcCharSet charset; 882 const char *csptr; 883 char *ctptr; 884 int csstr_len, ct_len; 885 char *ext_segment_start; 886 int char_size; 887 888 /* One argument is required, of type XlcCharSet. 
*/ 889 if (num_args < 1) 890 return -1; 891 892 csptr = *((const char **) from); 893 ctptr = *((char **) to); 894 csstr_len = *from_left; 895 ct_len = *to_left; 896 897 charset = (XlcCharSet) args[0]; 898 899 ct_info = _XlcGetCTInfoFromCharSet(charset); 900 if (ct_info == NULL) 901 return -1; 902 903 side = charset->side; 904 length = strlen(ct_info->ct_sequence); 905 906 ext_segment_start = NULL; 907 908 if (ct_info->type == XctOtherCoding) { 909 /* Output the Escape sequence for switching to the charset, and 910 reserve room now for the XctReturn sequence at the end. */ 911 if (ct_len < length + 3) 912 return -1; 913 914 memcpy(ctptr, ct_info->ct_sequence, length); 915 ctptr += length; 916 ct_len -= length + 3; 917 } else 918 /* Test whether the charset is already active. */ 919 if (((side == XlcGR || side == XlcGLGR) 920 && charset != state->GR_charset) 921 || ((side == XlcGL || side == XlcGLGR) 922 && charset != state->GL_charset)) { 923 924 /* Output the Escape sequence for switching to the charset. */ 925 if (ct_info->type == XctExtSeg) { 926 if (ct_len < length + 2 + ct_info->ext_segment_len) 927 return -1; 928 929 memcpy(ctptr, ct_info->ct_sequence, length); 930 ctptr += length; 931 ct_len -= length; 932 933 ctptr += 2; 934 ct_len -= 2; 935 ext_segment_start = ctptr; 936 937 /* The size of an extended segment must fit in 14 bits. */ 938 if (ct_len > 0x3fff) 939 ct_len = 0x3fff; 940 941 memcpy(ctptr, ct_info->ext_segment, ct_info->ext_segment_len); 942 ctptr += ct_info->ext_segment_len; 943 ct_len -= ct_info->ext_segment_len; 944 } else { 945 if (ct_len < length) 946 return -1; 947 948 memcpy(ctptr, ct_info->ct_sequence, length); 949 ctptr += length; 950 ct_len -= length; 951 } 952 } 953 954 /* If the charset has side GL or GR, prepare remapping the characters 955 to the correct side. 
*/ 956 if (charset->set_size) { 957 min_ch = 0x20; 958 max_ch = 0x7f; 959 if (charset->set_size == 94) { 960 max_ch--; 961 if (charset->char_size > 1 || side == XlcGR) 962 min_ch++; 963 } 964 } 965 966 /* Actually copy the contents. */ 967 unconv_num = 0; 968 char_size = charset->char_size; 969 if (char_size == 1) { 970 while (csstr_len > 0 && ct_len > 0) { 971 if (charset->set_size) { 972 /* The CompoundText specification says that the only 973 control characters allowed are 0x09, 0x0a, 0x1b, 0x9b. 974 Therefore here we eliminate other control characters. */ 975 unsigned char ch = *((unsigned char *) csptr) & 0x7f; 976 if (!((ch >= min_ch && ch <= max_ch) 977 || (side == XlcGL 978 && (ch == 0x00 || ch == 0x09 || ch == 0x0a)) 979 || ((side == XlcGL || side == XlcGR) 980 && (ch == 0x1b)))) { 981 csptr++; 982 csstr_len--; 983 unconv_num++; 984 continue; 985 } 986 } 987 988 if (side == XlcGL) 989 *ctptr++ = *csptr++ & 0x7f; 990 else if (side == XlcGR) 991 *ctptr++ = *csptr++ | 0x80; 992 else 993 *ctptr++ = *csptr++; 994 csstr_len--; 995 ct_len--; 996 } 997 } else if (char_size > 1) { 998 while (csstr_len >= char_size && ct_len >= char_size) { 999 if (side == XlcGL) { 1000 int i; 1001 for (i = char_size; i > 0; i--) 1002 *ctptr++ = *csptr++ & 0x7f; 1003 } else if (side == XlcGR) { 1004 int i; 1005 for (i = char_size; i > 0; i--) 1006 *ctptr++ = *csptr++ | 0x80; 1007 } else { 1008 int i; 1009 for (i = char_size; i > 0; i--) 1010 *ctptr++ = *csptr++; 1011 } 1012 csstr_len -= char_size; 1013 ct_len -= char_size; 1014 } 1015 } else { 1016 /* char_size = 0. The code here is good only for valid UTF-8 input. */ 1017 if ((charset->ct_sequence[0] == XctESC 1018 && charset->ct_sequence[1] == XctOtherCoding 1019 && charset->ct_sequence[2] == 'G') 1020#if UTF8_IN_EXTSEQ 1021 || strcmp(charset->name, "ISO10646-1") == 0 1022#endif 1023 ) { 1024 while (csstr_len > 0 && ct_len > 0) { 1025 unsigned char ch = * (unsigned char *) csptr; 1026 int char_size = (ch < 0xc0 ? 
1 : 1027 ch < 0xe0 ? 2 : 1028 ch < 0xf0 ? 3 : 1029 ch < 0xf8 ? 4 : 1030 ch < 0xfc ? 5 : 1031 6); 1032 int i; 1033 if (!(csstr_len >= char_size && ct_len >= char_size)) 1034 break; 1035 for (i = char_size; i > 0; i--) 1036 *ctptr++ = *csptr++; 1037 csstr_len -= char_size; 1038 ct_len -= char_size; 1039 } 1040 } else { 1041 while (csstr_len > 0 && ct_len > 0) { 1042 *ctptr++ = *csptr++; 1043 csstr_len--; 1044 ct_len--; 1045 } 1046 } 1047 } 1048 1049 if (ct_info->type == XctOtherCoding) { 1050 /* Terminate with an XctReturn sequence. */ 1051 ctptr[0] = XctESC; 1052 ctptr[1] = XctOtherCoding; 1053 ctptr[2] = '@'; 1054 ctptr += 3; 1055 } else if (ext_segment_start != NULL) { 1056 /* Backpatch the extended segment's length. */ 1057 int ext_segment_length = ctptr - ext_segment_start; 1058 *(ext_segment_start - 2) = (ext_segment_length >> 7) | 0x80; 1059 *(ext_segment_start - 1) = (ext_segment_length & 0x7f) | 0x80; 1060 } else { 1061 if (side == XlcGR || side == XlcGLGR) 1062 state->GR_charset = charset; 1063 if (side == XlcGL || side == XlcGLGR) 1064 state->GL_charset = charset; 1065 } 1066 1067 *from_left -= csptr - *((const char **) from); 1068 *from = (XPointer) csptr; 1069 1070 *to_left -= ctptr - *((char **) to); 1071 *to = (XPointer) ctptr; 1072 1073 return 0; 1074} 1075 1076/* from XlcNString to XlcNCharSet */ 1077 1078static int 1079strtocs( 1080 XlcConv conv, 1081 XPointer *from, 1082 int *from_left, 1083 XPointer *to, 1084 int *to_left, 1085 XPointer *args, 1086 int num_args) 1087{ 1088 State state = (State) conv->state; 1089 const char *src; 1090 char *dst; 1091 unsigned char side; 1092 int length; 1093 1094 src = (const char *) *from; 1095 dst = (char *) *to; 1096 1097 length = min(*from_left, *to_left); 1098 side = *((unsigned char *) src) & 0x80; 1099 1100 while (side == (*((unsigned char *) src) & 0x80) && length-- > 0) 1101 *dst++ = *src++; 1102 1103 *from_left -= src - (const char *) *from; 1104 *from = (XPointer) src; 1105 *to_left -= dst - (char *) 
*to; 1106 *to = (XPointer) dst; 1107 1108 if (num_args > 0) 1109 *((XlcCharSet *)args[0]) = (side ? state->GR_charset : state->GL_charset); 1110 1111 return 0; 1112} 1113 1114/* from XlcNCharSet to XlcNString */ 1115 1116static int 1117cstostr( 1118 XlcConv conv, 1119 XPointer *from, 1120 int *from_left, 1121 XPointer *to, 1122 int *to_left, 1123 XPointer *args, 1124 int num_args) 1125{ 1126 State state = (State) conv->state; 1127 const char *csptr; 1128 char *string_ptr; 1129 int csstr_len, str_len; 1130 unsigned char ch; 1131 int unconv_num = 0; 1132 1133 /* This converter can only convert from ISO8859-1:GL and ISO8859-1:GR. */ 1134 if (num_args < 1 1135 || !((XlcCharSet) args[0] == state->GL_charset 1136 || (XlcCharSet) args[0] == state->GR_charset)) 1137 return -1; 1138 1139 csptr = *((const char **) from); 1140 string_ptr = *((char **) to); 1141 csstr_len = *from_left; 1142 str_len = *to_left; 1143 1144 while (csstr_len > 0 && str_len > 0) { 1145 ch = *((unsigned char *) csptr++); 1146 csstr_len--; 1147 /* Citing ICCCM: "STRING as a type specifies the ISO Latin-1 character 1148 set plus the control characters TAB and NEWLINE." 
*/ 1149 if ((ch < 0x20 && ch != 0x00 && ch != 0x09 && ch != 0x0a) 1150 || (ch >= 0x7f && ch < 0xa0)) { 1151 unconv_num++; 1152 continue; 1153 } 1154 *((unsigned char *) string_ptr++) = ch; 1155 str_len--; 1156 } 1157 1158 *from_left -= csptr - *((const char **) from); 1159 *from = (XPointer) csptr; 1160 1161 *to_left -= string_ptr - *((char **) to); 1162 *to = (XPointer) string_ptr; 1163 1164 return unconv_num; 1165} 1166 1167 1168static XlcConv 1169create_conv( 1170 XlcConvMethods methods) 1171{ 1172 XlcConv conv; 1173 1174 conv = (XlcConv) Xmalloc(sizeof(XlcConvRec) + sizeof(StateRec)); 1175 if (conv == NULL) 1176 return (XlcConv) NULL; 1177 1178 conv->state = (XPointer) &conv[1]; 1179 1180 conv->methods = methods; 1181 1182 init_state(conv); 1183 1184 return conv; 1185} 1186 1187static void 1188close_converter( 1189 XlcConv conv) 1190{ 1191 /* conv->state is allocated together with conv, free both at once. */ 1192 Xfree((char *) conv); 1193} 1194 1195 1196static XlcConvMethodsRec cttocs_methods = { 1197 close_converter, 1198 cttocs, 1199 init_state 1200}; 1201 1202static XlcConv 1203open_cttocs( 1204 XLCd from_lcd, 1205 const char *from_type, 1206 XLCd to_lcd, 1207 const char *to_type) 1208{ 1209 return create_conv(&cttocs_methods); 1210} 1211 1212 1213static XlcConvMethodsRec cstoct_methods = { 1214 close_converter, 1215 cstoct, 1216 init_state 1217}; 1218 1219static XlcConv 1220open_cstoct( 1221 XLCd from_lcd, 1222 const char *from_type, 1223 XLCd to_lcd, 1224 const char *to_type) 1225{ 1226 return create_conv(&cstoct_methods); 1227} 1228 1229 1230static XlcConvMethodsRec strtocs_methods = { 1231 close_converter, 1232 strtocs, 1233 init_state 1234}; 1235 1236static XlcConv 1237open_strtocs( 1238 XLCd from_lcd, 1239 const char *from_type, 1240 XLCd to_lcd, 1241 const char *to_type) 1242{ 1243 return create_conv(&strtocs_methods); 1244} 1245 1246 1247static XlcConvMethodsRec cstostr_methods = { 1248 close_converter, 1249 cstostr, 1250 init_state 1251}; 1252 
1253static XlcConv 1254open_cstostr( 1255 XLCd from_lcd, 1256 const char *from_type, 1257 XLCd to_lcd, 1258 const char *to_type) 1259{ 1260 return create_conv(&cstostr_methods); 1261} 1262 1263 1264/* =========================== Initialization =========================== */ 1265 1266Bool 1267_XlcInitCTInfo(void) 1268{ 1269 if (ct_list == NULL) { 1270 const CTDataRec *ct_data; 1271 int num; 1272 XlcCharSet charset; 1273 1274 /* Initialize ct_list. */ 1275 1276 num = sizeof(default_ct_data) / sizeof(CTDataRec); 1277 for (ct_data = default_ct_data; num > 0; ct_data++, num--) { 1278 charset = _XlcAddCT(ct_data->name, ct_data->ct_sequence); 1279 if (charset == NULL) 1280 continue; 1281 if (strncmp(charset->ct_sequence, "\x1b\x25\x2f", 3) != 0) 1282 charset->source = CSsrcStd; 1283 else 1284 charset->source = CSsrcXLC; 1285 } 1286 1287 /* Register CompoundText and CharSet converters. */ 1288 1289 _XlcSetConverter((XLCd) NULL, XlcNCompoundText, 1290 (XLCd) NULL, XlcNCharSet, 1291 open_cttocs); 1292 _XlcSetConverter((XLCd) NULL, XlcNString, 1293 (XLCd) NULL, XlcNCharSet, 1294 open_strtocs); 1295 1296 _XlcSetConverter((XLCd) NULL, XlcNCharSet, 1297 (XLCd) NULL, XlcNCompoundText, 1298 open_cstoct); 1299 _XlcSetConverter((XLCd) NULL, XlcNCharSet, 1300 (XLCd) NULL, XlcNString, 1301 open_cstostr); 1302 } 1303 1304 return True; 1305} 1306