lcCT.c revision 9c019ec5
1/*
2 * Copyright 1992, 1993 by TOSHIBA Corp.
3 *
4 * Permission to use, copy, modify, and distribute this software and its
5 * documentation for any purpose and without fee is hereby granted, provided
6 * that the above copyright notice appear in all copies and that both that
7 * copyright notice and this permission notice appear in supporting
8 * documentation, and that the name of TOSHIBA not be used in advertising
9 * or publicity pertaining to distribution of the software without specific,
10 * written prior permission. TOSHIBA make no representations about the
11 * suitability of this software for any purpose.  It is provided "as is"
12 * without express or implied warranty.
13 *
14 * TOSHIBA DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
15 * ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL
16 * TOSHIBA BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
17 * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
18 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
19 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
20 * SOFTWARE.
21 *
22 * Author: Katsuhisa Yano	TOSHIBA Corp.
23 *			   	mopi@osa.ilab.toshiba.co.jp
24 */
25/*
26 * Copyright 1995 by FUJITSU LIMITED
27 * This is source code modified by FUJITSU LIMITED under the Joint
28 * Development Agreement for the CDE/Motif PST.
29 *
30 * Modifier: Takanori Tateno   FUJITSU LIMITED
31 *
32 */
33/*
34 *  2000
35 *  Modifier: Ivan Pascal     The XFree86 Project
36 *  Modifier: Bruno Haible    The XFree86 Project
37 */
38
39#ifdef HAVE_CONFIG_H
40#include <config.h>
41#endif
42#include "Xlibint.h"
43#include "XlcPubI.h"
44#include <X11/Xos.h>
45#include <stdio.h>
46
47
48/* ====================== Built-in Character Sets ====================== */
49
/*
 * Compile-time description of one character set that may appear in
 * Compound Text.  Both members are fixed-size character arrays holding
 * NUL-terminated strings.
 */
typedef struct _CTDataRec {
    /* Charset name, optionally followed by a ":GL", ":GR" or ":GLGR"
       side suffix. */
    const char name[19];
    /* ESC sequence that designates this charset in Compound Text. */
    const char ct_sequence[5];
} CTDataRec, *CTData;
57
58static const CTDataRec default_ct_data[] =
59{
60    /*                                                                    */
61    /* X11 registry name       MIME name         ISO-IR      ESC sequence */
62    /*                                                                    */
63
64    /* Registered character sets with one byte per character */
65    { "ISO8859-1:GL",       /* US-ASCII              6   */  "\033(B" },
66    { "ISO8859-1:GR",       /* ISO-8859-1          100   */  "\033-A" },
67    { "ISO8859-2:GR",       /* ISO-8859-2          101   */  "\033-B" },
68    { "ISO8859-3:GR",       /* ISO-8859-3          109   */  "\033-C" },
69    { "ISO8859-4:GR",       /* ISO-8859-4          110   */  "\033-D" },
70    { "ISO8859-5:GR",       /* ISO-8859-5          144   */  "\033-L" },
71    { "ISO8859-6:GR",       /* ISO-8859-6          127   */  "\033-G" },
72    { "ISO8859-7:GR",       /* ISO-8859-7          126   */  "\033-F" },
73    { "ISO8859-8:GR",       /* ISO-8859-8          138   */  "\033-H" },
74    { "ISO8859-9:GR",       /* ISO-8859-9          148   */  "\033-M" },
75    { "ISO8859-10:GR",      /* ISO-8859-10         157   */  "\033-V" },
76    { "ISO8859-11:GR",      /* ISO-8859-11         166   */  "\033-T" },
77    { "ISO8859-13:GR",      /* ISO-8859-13         179   */  "\033-Y" },
78    { "ISO8859-14:GR",      /* ISO-8859-14         199   */  "\033-_" },
79    { "ISO8859-15:GR",      /* ISO-8859-15         203   */  "\033-b" },
80    { "ISO8859-16:GR",      /* ISO-8859-16         226   */  "\033-f" },
81    { "JISX0201.1976-0:GL", /* ISO-646-JP           14   */  "\033(J" },
82    { "JISX0201.1976-0:GR",                                  "\033)I" },
83#if 0
84    { "TIS620-0:GR",        /* TIS-620             166   */  "\033-T" },
85#endif
86
87    /* Registered character sets with two byte per character */
88    { "GB2312.1980-0:GL",   /* GB_2312-80           58   */ "\033$(A" },
89    { "GB2312.1980-0:GR",   /* GB_2312-80           58   */ "\033$)A" },
90    { "JISX0208.1983-0:GL", /* JIS_X0208-1983       87   */ "\033$(B" },
91    { "JISX0208.1983-0:GR", /* JIS_X0208-1983       87   */ "\033$)B" },
92    { "JISX0208.1990-0:GL", /* JIS_X0208-1990      168   */ "\033$(B" },
93    { "JISX0208.1990-0:GR", /* JIS_X0208-1990      168   */ "\033$)B" },
94    { "JISX0212.1990-0:GL", /* JIS_X0212-1990      159   */ "\033$(D" },
95    { "JISX0212.1990-0:GR", /* JIS_X0212-1990      159   */ "\033$)D" },
96    { "KSC5601.1987-0:GL",  /* KS_C_5601-1987      149   */ "\033$(C" },
97    { "KSC5601.1987-0:GR",  /* KS_C_5601-1987      149   */ "\033$)C" },
98    { "CNS11643.1986-1:GL", /* CNS 11643-1992 pl.1 171   */ "\033$(G" },
99    { "CNS11643.1986-1:GR", /* CNS 11643-1992 pl.1 171   */ "\033$)G" },
100    { "CNS11643.1986-2:GL", /* CNS 11643-1992 pl.2 172   */ "\033$(H" },
101    { "CNS11643.1986-2:GR", /* CNS 11643-1992 pl.2 172   */ "\033$)H" },
102    { "CNS11643.1992-3:GL", /* CNS 11643-1992 pl.3 183   */ "\033$(I" },
103    { "CNS11643.1992-3:GR", /* CNS 11643-1992 pl.3 183   */ "\033$)I" },
104    { "CNS11643.1992-4:GL", /* CNS 11643-1992 pl.4 184   */ "\033$(J" },
105    { "CNS11643.1992-4:GR", /* CNS 11643-1992 pl.4 184   */ "\033$)J" },
106    { "CNS11643.1992-5:GL", /* CNS 11643-1992 pl.5 185   */ "\033$(K" },
107    { "CNS11643.1992-5:GR", /* CNS 11643-1992 pl.5 185   */ "\033$)K" },
108    { "CNS11643.1992-6:GL", /* CNS 11643-1992 pl.6 186   */ "\033$(L" },
109    { "CNS11643.1992-6:GR", /* CNS 11643-1992 pl.6 186   */ "\033$)L" },
110    { "CNS11643.1992-7:GL", /* CNS 11643-1992 pl.7 187   */ "\033$(M" },
111    { "CNS11643.1992-7:GR", /* CNS 11643-1992 pl.7 187   */ "\033$)M" },
112
113    /* Registered encodings with a varying number of bytes per character */
114    { "ISO10646-1",         /* UTF-8               196   */ "\033%G"  },
115
116    /* Encodings without ISO-IR assigned escape sequence must be
117       defined in XLC_LOCALE files, using "\033%/1" or "\033%/2". */
118
119    /* Backward compatibility with XFree86 3.x */
120#if 1
121    { "ISO8859-14:GR",                                      "\033%/1" },
122    { "ISO8859-15:GR",                                      "\033%/1" },
123#endif
124    /* For use by utf8 -> ctext */
125    { "BIG5-0:GLGR", "\033%/2"},
126    { "BIG5HKSCS-0:GLGR", "\033%/2"},
127    { "GBK-0:GLGR", "\033%/2"},
128    /* used by Emacs, but not backed by ISO-IR */
129    { "BIG5-E0:GL", "\033$(0" },
130    { "BIG5-E0:GR", "\033$)0" },
131    { "BIG5-E1:GL", "\033$(1" },
132    { "BIG5-E1:GR", "\033$)1" },
133
134};
135
/*
 * UTF-8 is represented as an XlcGLGR charset, not inside extended
 * segments.  Converter code guarded by "#if UTF8_IN_EXTSEQ" is therefore
 * compiled out while this macro is 0.
 */
#define UTF8_IN_EXTSEQ 0
138
139/* ======================= Parsing ESC Sequences ======================= */
140
/* Single bytes that start a code range or a control sequence. */
#define XctC0           0x0000
#define XctHT           0x0009
#define XctNL           0x000a
#define XctESC          0x001b
#define XctGL           0x0020
#define XctC1           0x0080
#define XctCSI          0x009b
#define XctGR           0x00a0
#define XctSTX          0x0002

/* Bytes that may follow ESC or CSI.  Several of them double as the
   return codes of _XlcParseCT. */
#define XctCntrlFunc    0x0023  /* '#' */
#define XctMB           0x0024  /* '$' */
#define XctOtherCoding  0x0025  /* '%' */
#define XctGL94         0x0028  /* '(' */
#define XctGR94         0x0029  /* ')' */
#define XctGR96         0x002d  /* '-' */
#define XctNonStandard  0x002f  /* '/' */
#define XctIgnoreExt    0x0030  /* '0' */
#define XctNotIgnoreExt 0x0031  /* '1' */
#define XctLeftToRight  0x0031  /* '1' */
#define XctRightToLeft  0x0032  /* '2' */
#define XctDirection    0x005d  /* ']' */
#define XctDirectionEnd 0x005d  /* ']' */

/* Two-byte combinations, used only as return codes of _XlcParseCT. */
#define XctGL94MB       0x2428  /* "$(" */
#define XctGR94MB       0x2429  /* "$)" */
#define XctExtSeg       0x252f  /* "%/" */
#define XctReturn       0x2540  /* "%@" */

/*
 * Recognizes the control sequence at the start of a Compound Text segment.
 *
 * *text points to the input and *length is the number of bytes available.
 *
 * Returns 0 when no valid sequence is found; *text and *length are then
 * left untouched.  Otherwise returns one of
 *   XctGL94, XctGR94, XctGR96, XctGL94MB, XctGR94MB,
 *   XctLeftToRight, XctRightToLeft, XctDirectionEnd,
 *   XctExtSeg, XctOtherCoding, XctReturn, XctIgnoreExt, XctNotIgnoreExt
 * and advances *text / shrinks *length past the recognized bytes.
 *
 * *final_byte is always reset to 0 first.  For the charset designators,
 * XctExtSeg, XctOtherCoding, XctReturn, XctIgnoreExt and XctNotIgnoreExt
 * it receives the byte that selects the charset or version.  Note that
 * after "ESC #" or "ESC $" it is stored even if the sequence as a whole
 * is rejected.
 */
static unsigned int
_XlcParseCT(
    const char **text,
    int *length,
    unsigned char *final_byte)
{
    const unsigned char *bytes = (const unsigned char *) *text;
    const int avail = *length;
    unsigned int kind = 0;      /* return code; 0 = nothing recognized */
    int used = 0;               /* bytes consumed when kind != 0 */

    *final_byte = 0;

    if (avail < 1)
        return 0;

    if (bytes[0] == XctESC) {
        if (avail < 2)
            return 0;

        switch (bytes[1]) {
        case XctOtherCoding:                    /* ESC % ... */
            if (avail < 3)
                return 0;
            if (bytes[2] == XctNonStandard) {   /* ESC % / F */
                if (avail < 4)
                    return 0;
                kind = XctExtSeg;
                *final_byte = bytes[3];
                used = 4;
            } else {                            /* ESC % @  or  ESC % F */
                kind = (bytes[2] == '@') ? XctReturn : XctOtherCoding;
                *final_byte = bytes[2];
                used = 3;
            }
            break;

        case XctCntrlFunc:                      /* ESC # F I */
            if (avail < 4)
                return 0;
            *final_byte = bytes[2];
            if (bytes[3] == XctIgnoreExt)
                kind = XctIgnoreExt;
            else if (bytes[3] == XctNotIgnoreExt)
                kind = XctNotIgnoreExt;
            used = 4;
            break;

        case XctMB:                             /* ESC $ ( F  or  ESC $ ) F */
            if (avail < 4)
                return 0;
            if (bytes[2] == XctGL94)
                kind = XctGL94MB;
            else if (bytes[2] == XctGR94)
                kind = XctGR94MB;
            *final_byte = bytes[3];
            used = 4;
            break;

        case XctGL94:                           /* ESC ( F */
        case XctGR94:                           /* ESC ) F */
        case XctGR96:                           /* ESC - F */
            if (avail < 3)
                return 0;
            /* The intermediate byte is numerically equal to the code. */
            kind = bytes[1];
            *final_byte = bytes[2];
            used = 3;
            break;

        default:
            break;
        }
    } else if (bytes[0] == XctCSI) {
        /* Directionality control. */
        if (avail < 2)
            return 0;

        switch (bytes[1]) {
        case XctLeftToRight:                    /* CSI 1 ] */
        case XctRightToLeft:                    /* CSI 2 ] */
            if (avail < 3)
                return 0;
            if (bytes[2] == XctDirection)
                kind = bytes[1];
            used = 3;
            break;

        case XctDirectionEnd:                   /* CSI ] */
            kind = XctDirectionEnd;
            used = 2;
            break;

        default:
            break;
        }
    }

    if (kind != 0) {
        *length = avail - used;
        *text += used;
    }
    return kind;
}
305
306/*
307 * Fills into a freshly created XlcCharSet the fields that can be inferred
308 * from the ESC sequence. These are side, char_size, set_size.
309 * Returns True if the charset can be used with Compound Text.
310 *
311 * Used by _XlcCreateDefaultCharSet.
312 */
313Bool
314_XlcParseCharSet(
315    XlcCharSet charset)
316{
317    unsigned int type;
318    unsigned char final_byte;
319    const char *ptr = charset->ct_sequence;
320    int length;
321    int char_size;
322
323    if (*ptr == '\0')
324    	return False;
325
326    length = (int) strlen(ptr);
327
328    type = _XlcParseCT(&ptr, &length, &final_byte);
329
330    /* Check for validity and determine char_size.
331       char_size = 0 means varying number of bytes per character. */
332    switch (type) {
333        case XctGL94:
334        case XctGR94:
335        case XctGR96:
336            char_size = 1;
337            break;
338        case XctGL94MB:
339        case XctGR94MB:
340            char_size = (final_byte < 0x60 ? 2 : final_byte < 0x70 ? 3 : 4);
341            break;
342        case XctExtSeg:
343            char_size = final_byte - '0';
344            if (!(char_size >= 0 && char_size <= 4))
345                return False;
346            break;
347        case XctOtherCoding:
348            char_size = 0;
349            break;
350        default:
351            return False;
352    }
353
354    charset->char_size = char_size;
355
356    /* Fill in other values. */
357    switch (type) {
358        case XctGL94:
359        case XctGL94MB:
360            charset->side = XlcGL;
361            charset->set_size = 94;
362            break;
363        case XctGR94:
364        case XctGR94MB:
365            charset->side = XlcGR;
366            charset->set_size = 94;
367            break;
368        case XctGR96:
369            charset->side = XlcGR;
370            charset->set_size = 96;
371            break;
372        case XctExtSeg:
373        case XctOtherCoding:
374            charset->side = XlcGLGR;
375            charset->set_size = 0;
376            break;
377    }
378    return True;
379}
380
381
382/* =============== Management of the List of Character Sets =============== */
383
384/*
385 * Representation of a character set that can be used for Compound Text,
386 * at run time.
387 * Note: This information is not contained in the XlcCharSet, because
388 * multiple ESC sequences may be used for the same XlcCharSet.
389 */
390typedef struct _CTInfoRec {
391    XlcCharSet charset;
392    const char *ct_sequence;	/* Compound Text ESC sequence */
393    unsigned int type;
394    unsigned char final_byte;
395				/* If type == XctExtSeg: */
396    const char *ext_segment;	/* extended segment name, then '\002' */
397    int ext_segment_len;	/* length of above, including final '\002' */
398
399    struct _CTInfoRec *next;
400} CTInfoRec, *CTInfo;
401
402/*
403 * List of character sets that can be used for Compound Text,
404 * Includes all that are listed in default_ct_data, but more can be added
405 * at runtime through _XlcAddCT.
406 */
407static CTInfo ct_list = NULL;
408static CTInfo ct_list_end = NULL;
409
410/*
411 * Returns a Compound Text info record for an ESC sequence.
412 * The first part of the ESC sequence has already been parsed into 'type'
413 * and 'final_byte'. The remainder starts at 'text', at least 'text_len'
414 * bytes (only used if type == XctExtSeg).
415 */
416static CTInfo
417_XlcGetCTInfo(
418    unsigned int type,
419    unsigned char final_byte,
420    const char *text,
421    int text_len)
422{
423    CTInfo ct_info;
424
425    for (ct_info = ct_list; ct_info; ct_info = ct_info->next)
426        if (ct_info->type == type
427            && ct_info->final_byte == final_byte
428            && (type != XctExtSeg
429                || (text_len >= ct_info->ext_segment_len
430                    && memcmp(text, ct_info->ext_segment,
431                              (size_t) ct_info->ext_segment_len) == 0)))
432            return ct_info;
433
434    return (CTInfo) NULL;
435}
436
437/* Returns the Compound Text info for a given XlcCharSet.
438   Returns NULL if none is found. */
439static CTInfo
440_XlcGetCTInfoFromCharSet(
441    XlcCharSet charset)
442{
443    CTInfo ct_info;
444
445    for (ct_info = ct_list; ct_info; ct_info = ct_info->next)
446	if (ct_info->charset == charset)
447	    return ct_info;
448
449    return (CTInfo) NULL;
450}
451
452/* Creates a new XlcCharSet, given its name (including side suffix) and
453   Compound Text ESC sequence (normally at most 4 bytes), and makes it
454   eligible for Compound Text processing. */
455XlcCharSet
456_XlcAddCT(
457    const char *name,
458    const char *ct_sequence)
459{
460    CTInfo ct_info, existing_info;
461    XlcCharSet charset;
462    const char *ct_ptr;
463    int length;
464    unsigned int type;
465    unsigned char final_byte;
466
467    charset = _XlcGetCharSet(name);
468    if (charset != NULL) {
469        /* Even if the charset already exists, it is OK to register a second
470           Compound Text sequence for it. */
471    } else {
472        /* Attempt to create the charset. */
473        charset = _XlcCreateDefaultCharSet(name, ct_sequence);
474        if (charset == NULL)
475	    return (XlcCharSet) NULL;
476        _XlcAddCharSet(charset);
477    }
478
479    /* Allocate a CTinfo record. */
480    length = (int) strlen(ct_sequence);
481    ct_info = Xmalloc(sizeof(CTInfoRec) + length+1);
482    if (ct_info == NULL)
483	return charset;
484
485    ct_info->charset = charset;
486    ct_info->ct_sequence = strcpy((char *) (ct_info + 1), ct_sequence);
487
488    /* Parse the Compound Text sequence. */
489    ct_ptr = ct_sequence;
490    type = _XlcParseCT(&ct_ptr, &length, &final_byte);
491
492    ct_info->type = type;
493    ct_info->final_byte = final_byte;
494
495    switch (type) {
496	case XctGL94:
497	case XctGR94:
498	case XctGR96:
499	case XctGL94MB:
500	case XctGR94MB:
501	case XctOtherCoding:
502            ct_info->ext_segment = NULL;
503            ct_info->ext_segment_len = 0;
504            break;
505	case XctExtSeg: {
506            /* By convention, the extended segment name is the encoding_name
507               in lowercase. */
508            const char *q = charset->encoding_name;
509            int n = (int) strlen(q);
510            char *p;
511
512            /* Ensure ct_info->ext_segment_len <= 0x3fff - 6. */
513            if (n > 0x3fff - 6 - 1) {
514                Xfree(ct_info);
515                return charset;
516            }
517            p = Xmalloc(n+1);
518            if (p == NULL) {
519                Xfree(ct_info);
520                return charset;
521            }
522            ct_info->ext_segment = p;
523            ct_info->ext_segment_len = n+1;
524            for ( ; n > 0; p++, q++, n--)
525                *p = (*q >= 'A' && *q <= 'Z' ? *q - 'A' + 'a' : *q);
526            *p = XctSTX;
527            break;
528        }
529	default:
530            Xfree(ct_info);
531            return (XlcCharSet) NULL;
532    }
533
534    /* Insert it into the list, if not already present. */
535    existing_info =
536        _XlcGetCTInfo(type, ct_info->final_byte,
537                      ct_info->ext_segment, ct_info->ext_segment_len);
538    if (existing_info == NULL) {
539        /* Insert it at the end. If there are duplicates CTinfo entries
540           for the same XlcCharSet, we want the first (standard) one to
541           override the second (user defined) one. */
542	ct_info->next = NULL;
543	if (ct_list_end)
544	    ct_list_end->next = ct_info;
545	else
546	    ct_list = ct_info;
547	ct_list_end = ct_info;
548    } else {
549        if (existing_info->charset != charset
550            /* We have a conflict, with one exception: JISX0208.1983-0 and
551               JISX0208.1990-0 are the same for all practical purposes. */
552            && !(strncmp(existing_info->charset->name, "JISX0208", 8) == 0
553                 && strncmp(charset->name, "JISX0208", 8) == 0)) {
554            fprintf(stderr,
555                    "Xlib: charsets %s and %s have the same CT sequence\n",
556                    charset->name, existing_info->charset->name);
557            if (strcmp(charset->ct_sequence, ct_sequence) == 0)
558                charset->ct_sequence = "";
559        }
560        Xfree(ct_info);
561    }
562
563    return charset;
564}
565
566
567/* ========== Converters String <--> CharSet <--> Compound Text ========== */
568
569/*
570 * Structure representing the parse state of a Compound Text string.
571 */
572typedef struct _StateRec {
573    XlcCharSet charset;		/* The charset of the current segment */
574    XlcCharSet GL_charset;	/* The charset responsible for 0x00..0x7F */
575    XlcCharSet GR_charset;	/* The charset responsible for 0x80..0xFF */
576    XlcCharSet Other_charset;	/* != NULL if currently in an other segment */
577    int ext_seg_left;		/* > 0 if currently in an extended segment */
578} StateRec, *State;
579
580
581/* Subroutine for parsing an ESC sequence. */
582
/* Outcome of examining a potential control sequence in Compound Text. */
typedef enum {
    /* Sequence recognized and skipped; the charset it selects has been
       stored in the parse state. */
    resOK = 0,
    /* Sequence skipped, but no charset is registered for it. */
    resNotInList = 1,
    /* Not a recognized ESC sequence; the input pointers are unchanged. */
    resNotCTSeq = 2
} CheckResult;
588
589static CheckResult
590_XlcCheckCTSequence(
591    State state,
592    const char **ctext,
593    int *ctext_len)
594{
595    XlcCharSet charset;
596    CTInfo ct_info;
597    const char *tmp_ctext = *ctext;
598    int tmp_ctext_len = *ctext_len;
599    unsigned int type;
600    unsigned char final_byte;
601    int ext_seg_left = 0;
602
603    /* Check for validity. */
604    type = _XlcParseCT(&tmp_ctext, &tmp_ctext_len, &final_byte);
605
606    switch (type) {
607	case XctGL94:
608	case XctGR94:
609	case XctGR96:
610	case XctGL94MB:
611	case XctGR94MB:
612	case XctOtherCoding:
613            *ctext = tmp_ctext;
614            *ctext_len = tmp_ctext_len;
615            break;
616        case XctReturn:
617            *ctext = tmp_ctext;
618            *ctext_len = tmp_ctext_len;
619            state->Other_charset = NULL;
620            return resOK;
621        case XctExtSeg:
622            if (tmp_ctext_len > 2
623                && (tmp_ctext[0] & 0x80) && (tmp_ctext[0] & 0x80)) {
624                unsigned int msb = tmp_ctext[0] & 0x7f;
625                unsigned int lsb = tmp_ctext[1] & 0x7f;
626                ext_seg_left = (msb << 7) + lsb;
627                if (ext_seg_left <= tmp_ctext_len - 2) {
628                    *ctext = tmp_ctext + 2;
629                    *ctext_len = tmp_ctext_len - 2;
630                    break;
631                }
632            }
633            return resNotCTSeq;
634        default:
635            return resNotCTSeq;
636    }
637
638    ct_info = _XlcGetCTInfo(type, final_byte, *ctext, ext_seg_left);
639
640    if (ct_info) {
641        charset = ct_info->charset;
642        state->ext_seg_left = ext_seg_left;
643        if (type == XctExtSeg) {
644            state->charset = charset;
645            /* Skip past the extended segment name and the separator. */
646            *ctext += ct_info->ext_segment_len;
647            *ctext_len -= ct_info->ext_segment_len;
648            state->ext_seg_left -= ct_info->ext_segment_len;
649        } else if (type == XctOtherCoding) {
650            state->Other_charset = charset;
651        } else {
652            if (charset->side == XlcGL) {
653                state->GL_charset = charset;
654            } else if (charset->side == XlcGR) {
655                state->GR_charset = charset;
656            } else {
657                state->GL_charset = charset;
658                state->GR_charset = charset;
659            }
660        }
661        return resOK;
662    } else {
663        state->ext_seg_left = 0;
664        if (type == XctExtSeg) {
665            /* Skip the entire extended segment. */
666            *ctext += ext_seg_left;
667            *ctext_len -= ext_seg_left;
668        }
669        return resNotInList;
670    }
671}
672
673static void
674init_state(
675    XlcConv conv)
676{
677    State state = (State) conv->state;
678    static XlcCharSet default_GL_charset = NULL;
679    static XlcCharSet default_GR_charset = NULL;
680
681    if (default_GL_charset == NULL) {
682	default_GL_charset = _XlcGetCharSet("ISO8859-1:GL");
683	default_GR_charset = _XlcGetCharSet("ISO8859-1:GR");
684    }
685
686    /* The initial state is ISO-8859-1 on both sides. */
687    state->GL_charset = state->charset = default_GL_charset;
688    state->GR_charset = default_GR_charset;
689
690    state->Other_charset = NULL;
691
692    state->ext_seg_left = 0;
693}
694
695/* from XlcNCompoundText to XlcNCharSet */
696
697static int
698cttocs(
699    XlcConv conv,
700    XPointer *from,
701    int *from_left,
702    XPointer *to,
703    int *to_left,
704    XPointer *args,
705    int num_args)
706{
707    State state = (State) conv->state;
708    XlcCharSet charset = NULL;
709    const char *ctptr;
710    char *bufptr;
711    int ctext_len, buf_len;
712    int unconv_num = 0;
713
714    ctptr = (const char *) *from;
715    bufptr = (char *) *to;
716    ctext_len = *from_left;
717    buf_len = *to_left;
718
719    while (ctext_len > 0 && buf_len > 0) {
720        if (state->ext_seg_left == 0) {
721            /* Not in the middle of an extended segment; look at next byte. */
722            unsigned char ch = *ctptr;
723            XlcCharSet ch_charset;
724
725            if (ch == XctESC) {
726                CheckResult ret =
727                    _XlcCheckCTSequence(state, &ctptr, &ctext_len);
728                if (ret == resOK)
729                    /* state has been modified. */
730                    continue;
731                if (ret == resNotInList) {
732                    /* XXX Just continue with previous charset. */
733                    unconv_num++;
734                    continue;
735                }
736            } else if (ch == XctCSI) {
737                /* XXX Simply ignore the XctLeftToRight, XctRightToLeft,
738                   XctDirectionEnd sequences for the moment. */
739                unsigned char dummy;
740                if (_XlcParseCT(&ctptr, &ctext_len, &dummy)) {
741                    unconv_num++;
742                    continue;
743                }
744            }
745
746            /* Find the charset which is responsible for this byte. */
747            ch_charset = (state->Other_charset != NULL ? state->Other_charset :
748                          (ch & 0x80 ? state->GR_charset : state->GL_charset));
749
750            /* Set the charset of this run, or continue the current run,
751               or stop the current run. */
752            if (charset) {
753                if (charset != ch_charset)
754                    break;
755            } else {
756                state->charset = charset = ch_charset;
757            }
758
759            /* We don't want to split a character into multiple pieces. */
760            if (buf_len < 6) {
761                if (charset->char_size > 0) {
762                    if (buf_len < charset->char_size)
763                        break;
764                } else {
765                    /* char_size == 0 is tricky. The code here is good only
766                       for valid UTF-8 input. */
767                    if (charset->ct_sequence[0] == XctESC
768                        && charset->ct_sequence[1] == XctOtherCoding
769                        && charset->ct_sequence[2] == 'G') {
770                        int char_size = (ch < 0xc0 ? 1 :
771                                         ch < 0xe0 ? 2 :
772                                         ch < 0xf0 ? 3 :
773                                         ch < 0xf8 ? 4 :
774                                         ch < 0xfc ? 5 :
775                                                     6);
776                        if (buf_len < char_size)
777                            break;
778                    }
779                }
780            }
781
782            *bufptr++ = *ctptr++;
783            ctext_len--;
784            buf_len--;
785        } else {
786            /* Copy as much as possible from the current extended segment
787               to the buffer. */
788            int char_size;
789
790            /* Set the charset of this run, or continue the current run,
791               or stop the current run. */
792            if (charset) {
793                if (charset != state->charset)
794                    break;
795            } else {
796                charset = state->charset;
797            }
798
799            char_size = charset->char_size;
800
801            if (state->ext_seg_left <= buf_len || char_size > 0) {
802                int n = (state->ext_seg_left <= buf_len
803                         ? state->ext_seg_left
804                         : (buf_len / char_size) * char_size);
805                memcpy(bufptr, ctptr, (size_t) n);
806                ctptr += n; ctext_len -= n;
807                bufptr += n; buf_len -= n;
808                state->ext_seg_left -= n;
809            } else {
810#if UTF8_IN_EXTSEQ
811                /* char_size == 0 is tricky. The code here is good only
812                   for valid UTF-8 input. */
813                if (strcmp(charset->name, "ISO10646-1") == 0) {
814                    unsigned char ch = *ctptr;
815                    int char_size = (ch < 0xc0 ? 1 :
816                                     ch < 0xe0 ? 2 :
817                                     ch < 0xf0 ? 3 :
818                                     ch < 0xf8 ? 4 :
819                                     ch < 0xfc ? 5 :
820                                                 6);
821                    int i;
822                    if (buf_len < char_size)
823                        break;
824                    /* A small loop is faster than calling memcpy. */
825                    for (i = char_size; i > 0; i--)
826                        *bufptr++ = *ctptr++;
827                    ctext_len -= char_size;
828                    buf_len -= char_size;
829                    state->ext_seg_left -= char_size;
830                } else
831#endif
832                {
833                    /* Here ctext_len >= state->ext_seg_left > buf_len.
834                       We may be splitting a character into multiple pieces.
835                       Oh well. */
836                    int n = buf_len;
837                    memcpy(bufptr, ctptr, (size_t) n);
838                    ctptr += n; ctext_len -= n;
839                    bufptr += n; buf_len -= n;
840                    state->ext_seg_left -= n;
841                }
842            }
843        }
844    }
845
846    /* 'charset' is the charset for the current run. In some cases,
847       'state->charset' contains the charset for the next run. Therefore,
848       return 'charset'.
849       'charset' may still be NULL only if no output was produced. */
850    if (num_args > 0)
851	*((XlcCharSet *) args[0]) = charset;
852
853    *from_left -= ctptr - *((const char **) from);
854    *from = (XPointer) ctptr;
855
856    *to_left -= bufptr - *((char **) to);
857    *to = (XPointer) bufptr;
858
859    return unconv_num;
860}
861
862/* from XlcNCharSet to XlcNCompoundText */
863
864static int
865cstoct(
866    XlcConv conv,
867    XPointer *from,
868    int *from_left,
869    XPointer *to,
870    int *to_left,
871    XPointer *args,
872    int num_args)
873{
874    State state = (State) conv->state;
875    XlcSide side;
876    unsigned char min_ch = 0, max_ch = 0;
877    int length, unconv_num;
878    CTInfo ct_info;
879    XlcCharSet charset;
880    const char *csptr;
881    char *ctptr;
882    int csstr_len, ct_len;
883    char *ext_segment_start;
884    int char_size;
885
886    /* One argument is required, of type XlcCharSet. */
887    if (num_args < 1)
888	return -1;
889
890    csptr = *((const char **) from);
891    ctptr = *((char **) to);
892    csstr_len = *from_left;
893    ct_len = *to_left;
894
895    charset = (XlcCharSet) args[0];
896
897    ct_info = _XlcGetCTInfoFromCharSet(charset);
898    if (ct_info == NULL)
899	return -1;
900
901    side = charset->side;
902    length = (int) strlen(ct_info->ct_sequence);
903
904    ext_segment_start = NULL;
905
906    if (ct_info->type == XctOtherCoding) {
907        /* Output the Escape sequence for switching to the charset, and
908           reserve room now for the XctReturn sequence at the end. */
909        if (ct_len < length + 3)
910            return -1;
911
912        memcpy(ctptr, ct_info->ct_sequence, (size_t) length);
913        ctptr += length;
914        ct_len -= length + 3;
915    } else
916    /* Test whether the charset is already active. */
917    if (((side == XlcGR || side == XlcGLGR)
918	 && charset != state->GR_charset)
919	|| ((side == XlcGL || side == XlcGLGR)
920	    && charset != state->GL_charset)) {
921
922        /* Output the Escape sequence for switching to the charset. */
923        if (ct_info->type == XctExtSeg) {
924            if (ct_len < length + 2 + ct_info->ext_segment_len)
925                return -1;
926
927            memcpy(ctptr, ct_info->ct_sequence, (size_t) length);
928            ctptr += length;
929            ct_len -= length;
930
931            ctptr += 2;
932            ct_len -= 2;
933            ext_segment_start = ctptr;
934
935            /* The size of an extended segment must fit in 14 bits. */
936            if (ct_len > 0x3fff)
937                ct_len = 0x3fff;
938
939            memcpy(ctptr, ct_info->ext_segment, (size_t) ct_info->ext_segment_len);
940            ctptr += ct_info->ext_segment_len;
941            ct_len -= ct_info->ext_segment_len;
942        } else {
943            if (ct_len < length)
944                return -1;
945
946            memcpy(ctptr, ct_info->ct_sequence, (size_t) length);
947            ctptr += length;
948            ct_len -= length;
949        }
950    }
951
952    /* If the charset has side GL or GR, prepare remapping the characters
953       to the correct side. */
954    if (charset->set_size) {
955        min_ch = 0x20;
956        max_ch = 0x7f;
957        if (charset->set_size == 94) {
958            max_ch--;
959	    if (charset->char_size > 1 || side == XlcGR)
960		min_ch++;
961        }
962    }
963
964    /* Actually copy the contents. */
965    unconv_num = 0;
966    char_size = charset->char_size;
967    if (char_size == 1) {
968	while (csstr_len > 0 && ct_len > 0) {
969	    if (charset->set_size) {
970		/* The CompoundText specification says that the only
971		   control characters allowed are 0x09, 0x0a, 0x1b, 0x9b.
972		   Therefore here we eliminate other control characters. */
973		unsigned char ch = *((const unsigned char *) csptr) & 0x7f;
974		if (!((ch >= min_ch && ch <= max_ch)
975		      || (side == XlcGL
976			  && (ch == 0x00 || ch == 0x09 || ch == 0x0a))
977		      || ((side == XlcGL || side == XlcGR)
978			  && (ch == 0x1b)))) {
979                    csptr++;
980                    csstr_len--;
981		    unconv_num++;
982                    continue;
983 		}
984	    }
985
986	    if (side == XlcGL)
987		*ctptr++ = *csptr++ & 0x7f;
988	    else if (side == XlcGR)
989		*ctptr++ = *csptr++ | 0x80;
990	    else
991		*ctptr++ = *csptr++;
992	    csstr_len--;
993	    ct_len--;
994	}
995    } else if (char_size > 1) {
996	while (csstr_len >= char_size && ct_len >= char_size) {
997	    if (side == XlcGL) {
998		int i;
999		for (i = char_size; i > 0; i--)
1000		    *ctptr++ = *csptr++ & 0x7f;
1001	    } else if (side == XlcGR) {
1002		int i;
1003		for (i = char_size; i > 0; i--)
1004		    *ctptr++ = *csptr++ | 0x80;
1005	    } else {
1006		int i;
1007		for (i = char_size; i > 0; i--)
1008		    *ctptr++ = *csptr++;
1009	    }
1010	    csstr_len -= char_size;
1011	    ct_len -= char_size;
1012	}
1013    } else {
1014        /* char_size = 0. The code here is good only for valid UTF-8 input. */
1015        if ((charset->ct_sequence[0] == XctESC
1016             && charset->ct_sequence[1] == XctOtherCoding
1017             && charset->ct_sequence[2] == 'G')
1018#if UTF8_IN_EXTSEQ
1019            || strcmp(charset->name, "ISO10646-1") == 0
1020#endif
1021           ) {
1022            while (csstr_len > 0 && ct_len > 0) {
1023                unsigned char ch = * (const unsigned char *) csptr;
1024                int ch_size = (ch < 0xc0 ? 1 :
1025                                 ch < 0xe0 ? 2 :
1026                                 ch < 0xf0 ? 3 :
1027                                 ch < 0xf8 ? 4 :
1028                                 ch < 0xfc ? 5 :
1029                                             6);
1030                int i;
1031                if (!(csstr_len >= ch_size && ct_len >= ch_size))
1032                    break;
1033                for (i = ch_size; i > 0; i--)
1034                    *ctptr++ = *csptr++;
1035                csstr_len -= ch_size;
1036                ct_len -= ch_size;
1037            }
1038        } else {
1039            while (csstr_len > 0 && ct_len > 0) {
1040                *ctptr++ = *csptr++;
1041                csstr_len--;
1042                ct_len--;
1043            }
1044        }
1045    }
1046
1047    if (ct_info->type == XctOtherCoding) {
1048        /* Terminate with an XctReturn sequence. */
1049        ctptr[0] = XctESC;
1050        ctptr[1] = XctOtherCoding;
1051        ctptr[2] = '@';
1052        ctptr += 3;
1053    } else if (ext_segment_start != NULL) {
1054        /* Backpatch the extended segment's length. */
1055        int ext_segment_length = ctptr - ext_segment_start;
1056        *(ext_segment_start - 2) = (ext_segment_length >> 7) | 0x80;
1057        *(ext_segment_start - 1) = (ext_segment_length & 0x7f) | 0x80;
1058    } else {
1059        if (side == XlcGR || side == XlcGLGR)
1060            state->GR_charset = charset;
1061        if (side == XlcGL || side == XlcGLGR)
1062            state->GL_charset = charset;
1063    }
1064
1065    *from_left -= csptr - *((const char **) from);
1066    *from = (XPointer) csptr;
1067
1068    *to_left -= ctptr - *((char **) to);
1069    *to = (XPointer) ctptr;
1070
1071    return 0;
1072}
1073
1074/* from XlcNString to XlcNCharSet */
1075
1076static int
1077strtocs(
1078    XlcConv conv,
1079    XPointer *from,
1080    int *from_left,
1081    XPointer *to,
1082    int *to_left,
1083    XPointer *args,
1084    int num_args)
1085{
1086    State state = (State) conv->state;
1087    const char *src;
1088    char *dst;
1089    unsigned char side;
1090    int length;
1091
1092    src = (const char *) *from;
1093    dst = (char *) *to;
1094
1095    length = min(*from_left, *to_left);
1096    side = *((const unsigned char *) src) & 0x80;
1097
1098    while (side == (*((const unsigned char *) src) & 0x80) && length-- > 0)
1099	*dst++ = *src++;
1100
1101    *from_left -= src - (const char *) *from;
1102    *from = (XPointer) src;
1103    *to_left -= dst - (char *) *to;
1104    *to = (XPointer) dst;
1105
1106    if (num_args > 0)
1107	*((XlcCharSet *)args[0]) = (side ? state->GR_charset : state->GL_charset);
1108
1109    return 0;
1110}
1111
1112/* from XlcNCharSet to XlcNString */
1113
1114static int
1115cstostr(
1116    XlcConv conv,
1117    XPointer *from,
1118    int *from_left,
1119    XPointer *to,
1120    int *to_left,
1121    XPointer *args,
1122    int num_args)
1123{
1124    State state = (State) conv->state;
1125    const char *csptr;
1126    char *string_ptr;
1127    int csstr_len, str_len;
1128    unsigned char ch;
1129    int unconv_num = 0;
1130
1131    /* This converter can only convert from ISO8859-1:GL and ISO8859-1:GR. */
1132    if (num_args < 1
1133	|| !((XlcCharSet) args[0] == state->GL_charset
1134	     || (XlcCharSet) args[0] == state->GR_charset))
1135	return -1;
1136
1137    csptr = *((const char **) from);
1138    string_ptr = *((char **) to);
1139    csstr_len = *from_left;
1140    str_len = *to_left;
1141
1142    while (csstr_len > 0 && str_len > 0) {
1143	ch = *((const unsigned char *) csptr++);
1144	csstr_len--;
1145	/* Citing ICCCM: "STRING as a type specifies the ISO Latin-1 character
1146	   set plus the control characters TAB and NEWLINE." */
1147	if ((ch < 0x20 && ch != 0x00 && ch != 0x09 && ch != 0x0a)
1148	    || (ch >= 0x7f && ch < 0xa0)) {
1149	    unconv_num++;
1150	    continue;
1151	}
1152	*((unsigned char *) string_ptr++) = ch;
1153	str_len--;
1154    }
1155
1156    *from_left -= csptr - *((const char **) from);
1157    *from = (XPointer) csptr;
1158
1159    *to_left -= string_ptr - *((char **) to);
1160    *to = (XPointer) string_ptr;
1161
1162    return unconv_num;
1163}
1164
1165
1166static XlcConv
1167create_conv(
1168    XlcConvMethods methods)
1169{
1170    XlcConv conv;
1171
1172    conv = Xmalloc(sizeof(XlcConvRec) + sizeof(StateRec));
1173    if (conv == NULL)
1174	return (XlcConv) NULL;
1175
1176    conv->state = (XPointer) &conv[1];
1177
1178    conv->methods = methods;
1179
1180    init_state(conv);
1181
1182    return conv;
1183}
1184
1185static void
1186close_converter(
1187    XlcConv conv)
1188{
1189    /* conv->state is allocated together with conv, free both at once.  */
1190    Xfree(conv);
1191}
1192
1193
1194static XlcConvMethodsRec cttocs_methods = {
1195    close_converter,
1196    cttocs,
1197    init_state
1198};
1199
1200static XlcConv
1201open_cttocs(
1202    XLCd from_lcd,
1203    const char *from_type,
1204    XLCd to_lcd,
1205    const char *to_type)
1206{
1207    return create_conv(&cttocs_methods);
1208}
1209
1210
1211static XlcConvMethodsRec cstoct_methods = {
1212    close_converter,
1213    cstoct,
1214    init_state
1215};
1216
1217static XlcConv
1218open_cstoct(
1219    XLCd from_lcd,
1220    const char *from_type,
1221    XLCd to_lcd,
1222    const char *to_type)
1223{
1224    return create_conv(&cstoct_methods);
1225}
1226
1227
1228static XlcConvMethodsRec strtocs_methods = {
1229    close_converter,
1230    strtocs,
1231    init_state
1232};
1233
1234static XlcConv
1235open_strtocs(
1236    XLCd from_lcd,
1237    const char *from_type,
1238    XLCd to_lcd,
1239    const char *to_type)
1240{
1241    return create_conv(&strtocs_methods);
1242}
1243
1244
1245static XlcConvMethodsRec cstostr_methods = {
1246    close_converter,
1247    cstostr,
1248    init_state
1249};
1250
1251static XlcConv
1252open_cstostr(
1253    XLCd from_lcd,
1254    const char *from_type,
1255    XLCd to_lcd,
1256    const char *to_type)
1257{
1258    return create_conv(&cstostr_methods);
1259}
1260
1261
1262/* =========================== Initialization =========================== */
1263
1264Bool
1265_XlcInitCTInfo(void)
1266{
1267    if (ct_list == NULL) {
1268        const CTDataRec *ct_data;
1269        int num;
1270        XlcCharSet charset;
1271
1272        /* Initialize ct_list.  */
1273
1274	num = sizeof(default_ct_data) / sizeof(CTDataRec);
1275	for (ct_data = default_ct_data; num > 0; ct_data++, num--) {
1276	    charset = _XlcAddCT(ct_data->name, ct_data->ct_sequence);
1277            if (charset == NULL)
1278                continue;
1279			if (strncmp(charset->ct_sequence, "\x1b\x25\x2f", 3) != 0)
1280				charset->source = CSsrcStd;
1281			else
1282				charset->source = CSsrcXLC;
1283	}
1284
1285        /* Register CompoundText and CharSet converters.  */
1286
1287        _XlcSetConverter((XLCd) NULL, XlcNCompoundText,
1288                         (XLCd) NULL, XlcNCharSet,
1289                         open_cttocs);
1290        _XlcSetConverter((XLCd) NULL, XlcNString,
1291                         (XLCd) NULL, XlcNCharSet,
1292                         open_strtocs);
1293
1294        _XlcSetConverter((XLCd) NULL, XlcNCharSet,
1295                         (XLCd) NULL, XlcNCompoundText,
1296                         open_cstoct);
1297        _XlcSetConverter((XLCd) NULL, XlcNCharSet,
1298                         (XLCd) NULL, XlcNString,
1299                         open_cstostr);
1300    }
1301
1302    return True;
1303}
1304