lcCT.c revision 61b2299d
1/* $Xorg: lcCT.c,v 1.4 2000/08/17 19:45:16 cpqbld Exp $ */
2/*
3 * Copyright 1992, 1993 by TOSHIBA Corp.
4 *
5 * Permission to use, copy, modify, and distribute this software and its
6 * documentation for any purpose and without fee is hereby granted, provided
7 * that the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of TOSHIBA not be used in advertising
10 * or publicity pertaining to distribution of the software without specific,
11 * written prior permission. TOSHIBA make no representations about the
12 * suitability of this software for any purpose.  It is provided "as is"
13 * without express or implied warranty.
14 *
15 * TOSHIBA DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
16 * ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL
17 * TOSHIBA BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
18 * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
19 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
20 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
21 * SOFTWARE.
22 *
23 * Author: Katsuhisa Yano	TOSHIBA Corp.
24 *			   	mopi@osa.ilab.toshiba.co.jp
25 */
26/*
27 * Copyright 1995 by FUJITSU LIMITED
28 * This is source code modified by FUJITSU LIMITED under the Joint
29 * Development Agreement for the CDE/Motif PST.
30 *
31 * Modifier: Takanori Tateno   FUJITSU LIMITED
32 *
33 */
34/*
35 *  2000
36 *  Modifier: Ivan Pascal     The XFree86 Project
37 *  Modifier: Bruno Haible    The XFree86 Project
38 */
39/* $XFree86: xc/lib/X11/lcCT.c,v 3.26 2001/10/28 03:32:34 tsi Exp $ */
40
41#ifdef HAVE_CONFIG_H
42#include <config.h>
43#endif
44#include "Xlibint.h"
45#include "XlcPubI.h"
46#include <X11/Xos.h>
47#include <stdio.h>
48
49
50/* ====================== Built-in Character Sets ====================== */
51
/*
 * Static representation of a character set that can be used in Compound Text.
 * Entries are written as positional initializers in default_ct_data, so the
 * field order here must stay { name, ct_sequence }.
 */
typedef struct _CTDataRec {
    /* Charset name; the longest built-in names (e.g. "JISX0201.1976-0:GL")
       are 18 characters, which fits together with the terminating NUL. */
    const char name[19];
    /* Compound Text encoding, ESC sequence; the built-in sequences are at
       most 4 bytes (e.g. "\033%/1") plus the terminating NUL. */
    const char ct_sequence[5];
} CTDataRec, *CTData;
59
/*
 * Built-in table of charsets usable in Compound Text: each entry pairs a
 * charset name with the ESC sequence that designates it.
 * Note that a charset name may appear more than once with different ESC
 * sequences (ISO8859-14:GR, ISO8859-15:GR), and that two names may share a
 * sequence (JISX0208.1983-0 and JISX0208.1990-0).
 */
static const CTDataRec default_ct_data[] =
{
    /*                                                                    */
    /* X11 registry name       MIME name         ISO-IR      ESC sequence */
    /*                                                                    */

    /* Registered character sets with one byte per character */
    { "ISO8859-1:GL",       /* US-ASCII              6   */  "\033(B" },
    { "ISO8859-1:GR",       /* ISO-8859-1          100   */  "\033-A" },
    { "ISO8859-2:GR",       /* ISO-8859-2          101   */  "\033-B" },
    { "ISO8859-3:GR",       /* ISO-8859-3          109   */  "\033-C" },
    { "ISO8859-4:GR",       /* ISO-8859-4          110   */  "\033-D" },
    { "ISO8859-5:GR",       /* ISO-8859-5          144   */  "\033-L" },
    { "ISO8859-6:GR",       /* ISO-8859-6          127   */  "\033-G" },
    { "ISO8859-7:GR",       /* ISO-8859-7          126   */  "\033-F" },
    { "ISO8859-8:GR",       /* ISO-8859-8          138   */  "\033-H" },
    { "ISO8859-9:GR",       /* ISO-8859-9          148   */  "\033-M" },
    { "ISO8859-10:GR",      /* ISO-8859-10         157   */  "\033-V" },
    { "ISO8859-11:GR",      /* ISO-8859-11         166   */  "\033-T" },
    { "ISO8859-13:GR",      /* ISO-8859-13         179   */  "\033-Y" },
    { "ISO8859-14:GR",      /* ISO-8859-14         199   */  "\033-_" },
    { "ISO8859-15:GR",      /* ISO-8859-15         203   */  "\033-b" },
    { "ISO8859-16:GR",      /* ISO-8859-16         226   */  "\033-f" },
    { "JISX0201.1976-0:GL", /* ISO-646-JP           14   */  "\033(J" },
    { "JISX0201.1976-0:GR",                                  "\033)I" },
    /* Disabled: uses the same ESC sequence as ISO8859-11:GR above. */
#if 0
    { "TIS620-0:GR",        /* TIS-620             166   */  "\033-T" },
#endif

    /* Registered character sets with two bytes per character */
    { "GB2312.1980-0:GL",   /* GB_2312-80           58   */ "\033$(A" },
    { "GB2312.1980-0:GR",   /* GB_2312-80           58   */ "\033$)A" },
    { "JISX0208.1983-0:GL", /* JIS_X0208-1983       87   */ "\033$(B" },
    { "JISX0208.1983-0:GR", /* JIS_X0208-1983       87   */ "\033$)B" },
    { "JISX0208.1990-0:GL", /* JIS_X0208-1990      168   */ "\033$(B" },
    { "JISX0208.1990-0:GR", /* JIS_X0208-1990      168   */ "\033$)B" },
    { "JISX0212.1990-0:GL", /* JIS_X0212-1990      159   */ "\033$(D" },
    { "JISX0212.1990-0:GR", /* JIS_X0212-1990      159   */ "\033$)D" },
    { "KSC5601.1987-0:GL",  /* KS_C_5601-1987      149   */ "\033$(C" },
    { "KSC5601.1987-0:GR",  /* KS_C_5601-1987      149   */ "\033$)C" },
    { "CNS11643.1986-1:GL", /* CNS 11643-1992 pl.1 171   */ "\033$(G" },
    { "CNS11643.1986-1:GR", /* CNS 11643-1992 pl.1 171   */ "\033$)G" },
    { "CNS11643.1986-2:GL", /* CNS 11643-1992 pl.2 172   */ "\033$(H" },
    { "CNS11643.1986-2:GR", /* CNS 11643-1992 pl.2 172   */ "\033$)H" },
    { "CNS11643.1992-3:GL", /* CNS 11643-1992 pl.3 183   */ "\033$(I" },
    { "CNS11643.1992-3:GR", /* CNS 11643-1992 pl.3 183   */ "\033$)I" },
    { "CNS11643.1992-4:GL", /* CNS 11643-1992 pl.4 184   */ "\033$(J" },
    { "CNS11643.1992-4:GR", /* CNS 11643-1992 pl.4 184   */ "\033$)J" },
    { "CNS11643.1992-5:GL", /* CNS 11643-1992 pl.5 185   */ "\033$(K" },
    { "CNS11643.1992-5:GR", /* CNS 11643-1992 pl.5 185   */ "\033$)K" },
    { "CNS11643.1992-6:GL", /* CNS 11643-1992 pl.6 186   */ "\033$(L" },
    { "CNS11643.1992-6:GR", /* CNS 11643-1992 pl.6 186   */ "\033$)L" },
    { "CNS11643.1992-7:GL", /* CNS 11643-1992 pl.7 187   */ "\033$(M" },
    { "CNS11643.1992-7:GR", /* CNS 11643-1992 pl.7 187   */ "\033$)M" },

    /* Registered encodings with a varying number of bytes per character */
    { "ISO10646-1",         /* UTF-8               196   */ "\033%G"  },

    /* Encodings without ISO-IR assigned escape sequence must be
       defined in XLC_LOCALE files, using "\033%/1" or "\033%/2". */

    /* Backward compatibility with XFree86 3.x */
#if 1
    { "ISO8859-14:GR",                                      "\033%/1" },
    { "ISO8859-15:GR",                                      "\033%/1" },
#endif
    /* For use by utf8 -> ctext */
    { "BIG5-0:GLGR", "\033%/2"},
    { "BIG5HKSCS-0:GLGR", "\033%/2"},
    { "GBK-0:GLGR", "\033%/2"},
    /* used by Emacs, but not backed by ISO-IR */
    { "BIG5-E0:GL", "\033$(0" },
    { "BIG5-E0:GR", "\033$)0" },
    { "BIG5-E1:GL", "\033$(1" },
    { "BIG5-E1:GR", "\033$)1" },

};
137
/* We represent UTF-8 as an XlcGLGR charset, not in extended segments.
   A nonzero value compiles in the UTF-8 specific copy path that cttocs
   uses for extended segments whose charset has char_size == 0. */
#define UTF8_IN_EXTSEQ 0
140
141/* ======================= Parsing ESC Sequences ======================= */
142
/* Individual byte values. */
#define XctC0		0x0000
#define XctHT		0x0009
#define XctNL		0x000a
#define XctESC		0x001b
#define XctGL		0x0020
#define XctC1		0x0080
#define XctCSI		0x009b
#define XctGR		0x00a0
#define XctSTX		0x0002

/* Bytes that may follow ESC or CSI inside a control sequence.  Several of
   them double as return codes of _XlcParseCT. */
#define XctCntrlFunc	0x0023
#define XctMB		0x0024
#define XctOtherCoding	0x0025
#define XctGL94		0x0028
#define XctGR94		0x0029
#define XctGR96		0x002d
#define XctNonStandard	0x002f
#define XctIgnoreExt	0x0030
#define XctNotIgnoreExt	0x0031
#define XctLeftToRight	0x0031
#define XctRightToLeft	0x0032
#define XctDirection	0x005d
#define XctDirectionEnd	0x005d

/* Return codes built from the two bytes that follow ESC. */
#define XctGL94MB	0x2428
#define XctGR94MB	0x2429
#define XctExtSeg	0x252f
#define XctReturn	0x2540

/*
 * Recognizes the control sequence at the start of a Compound Text segment.
 *
 *   text        in/out: points at the candidate sequence; advanced past it
 *               when the sequence is recognized.
 *   length      in/out: number of bytes available at *text; reduced by the
 *               number of bytes consumed when the sequence is recognized.
 *   final_byte  out: reset to 0 on entry, then set to the "final byte" for
 *               XctGL94, XctGR94, XctGR96, XctGL94MB, XctGR94MB, XctExtSeg,
 *               XctOtherCoding, XctIgnoreExt and XctNotIgnoreExt ('@' is
 *               stored for XctReturn).  It is also written when a complete
 *               four byte "ESC #" or "ESC $" sequence is rejected.
 *
 * Returns 0 when nothing is recognized (then *text and *length are left
 * alone), otherwise one of
 *   XctGL94, XctGR94, XctGR96, XctGL94MB, XctGR94MB,
 *   XctLeftToRight, XctRightToLeft, XctDirectionEnd,
 *   XctExtSeg, XctOtherCoding, XctReturn, XctIgnoreExt, XctNotIgnoreExt.
 * Note that XctNotIgnoreExt and XctLeftToRight share the value 0x31.
 */
static unsigned int
_XlcParseCT(
    const char **text,
    int *length,
    unsigned char *final_byte)
{
    const unsigned char *seq = (const unsigned char *) *text;
    const int avail = *length;
    unsigned int kind = 0;	/* result; stays 0 if nothing matches */
    int used = 0;		/* bytes making up the recognized sequence */

    *final_byte = 0;

    if (avail < 1)
        return 0;

    if (seq[0] == XctESC) {
        if (avail < 2)
            return 0;

        switch (seq[1]) {
            case XctOtherCoding:             /* ESC % ... */
                if (avail < 3)
                    return 0;
                if (seq[2] == XctNonStandard) {
                    /* ESC % / F : extended segment */
                    if (avail < 4)
                        return 0;
                    kind = XctExtSeg;
                    *final_byte = seq[3];
                    used = 4;
                } else {
                    /* ESC % @ returns to the standard coding, any other
                       byte selects an "other coding system". */
                    kind = (seq[2] == '@') ? XctReturn : XctOtherCoding;
                    *final_byte = seq[2];
                    used = 3;
                }
                break;

            case XctCntrlFunc:               /* ESC # F 0  or  ESC # F 1 */
                if (avail < 4)
                    return 0;
                *final_byte = seq[2];
                if (seq[3] == XctIgnoreExt)
                    kind = XctIgnoreExt;
                else if (seq[3] == XctNotIgnoreExt)
                    kind = XctNotIgnoreExt;
                used = 4;
                break;

            case XctMB:                      /* ESC $ ( F  or  ESC $ ) F */
                if (avail < 4)
                    return 0;
                if (seq[2] == XctGL94)
                    kind = XctGL94MB;
                else if (seq[2] == XctGR94)
                    kind = XctGR94MB;
                *final_byte = seq[3];
                used = 4;
                break;

            case XctGL94:                    /* ESC ( F */
            case XctGR94:                    /* ESC ) F */
            case XctGR96:                    /* ESC - F */
                if (avail < 3)
                    return 0;
                /* The designator byte itself is the return code. */
                kind = seq[1];
                *final_byte = seq[2];
                used = 3;
                break;
        }
    } else if (seq[0] == XctCSI) {
        /* Text direction control. */
        if (avail < 2)
            return 0;

        switch (seq[1]) {
            case XctLeftToRight:             /* CSI 1 ] */
            case XctRightToLeft:             /* CSI 2 ] */
                if (avail < 3)
                    return 0;
                if (seq[2] == XctDirection)
                    kind = (seq[1] == XctLeftToRight) ? XctLeftToRight
                                                      : XctRightToLeft;
                used = 3;
                break;
            case XctDirectionEnd:            /* CSI ] */
                kind = XctDirectionEnd;
                used = 2;
                break;
        }
    }

    /* Only a recognized sequence is consumed. */
    if (kind != 0) {
        *length = avail - used;
        *text += used;
    }
    return kind;
}
307
308/*
309 * Fills into a freshly created XlcCharSet the fields that can be inferred
310 * from the ESC sequence. These are side, char_size, set_size.
311 * Returns True if the charset can be used with Compound Text.
312 *
313 * Used by _XlcCreateDefaultCharSet.
314 */
315Bool
316_XlcParseCharSet(
317    XlcCharSet charset)
318{
319    unsigned int type;
320    unsigned char final_byte;
321    const char *ptr = charset->ct_sequence;
322    int length;
323    int char_size;
324
325    if (*ptr == '\0')
326    	return False;
327
328    length = strlen(ptr);
329
330    type = _XlcParseCT(&ptr, &length, &final_byte);
331
332    /* Check for validity and determine char_size.
333       char_size = 0 means varying number of bytes per character. */
334    switch (type) {
335        case XctGL94:
336        case XctGR94:
337        case XctGR96:
338            char_size = 1;
339            break;
340        case XctGL94MB:
341        case XctGR94MB:
342            char_size = (final_byte < 0x60 ? 2 : final_byte < 0x70 ? 3 : 4);
343            break;
344        case XctExtSeg:
345            char_size = final_byte - '0';
346            if (!(char_size >= 0 && char_size <= 4))
347                return False;
348            break;
349        case XctOtherCoding:
350            char_size = 0;
351            break;
352        default:
353            return False;
354    }
355
356    charset->char_size = char_size;
357
358    /* Fill in other values. */
359    switch (type) {
360        case XctGL94:
361        case XctGL94MB:
362            charset->side = XlcGL;
363            charset->set_size = 94;
364            break;
365        case XctGR94:
366        case XctGR94MB:
367            charset->side = XlcGR;
368            charset->set_size = 94;
369            break;
370        case XctGR96:
371            charset->side = XlcGR;
372            charset->set_size = 96;
373            break;
374        case XctExtSeg:
375        case XctOtherCoding:
376            charset->side = XlcGLGR;
377            charset->set_size = 0;
378            break;
379    }
380    return True;
381}
382
383
384/* =============== Management of the List of Character Sets =============== */
385
/*
 * Representation of a character set that can be used for Compound Text,
 * at run time.
 * Note: This information is not contained in the XlcCharSet, because
 * multiple ESC sequences may be used for the same XlcCharSet.
 * Records are created by _XlcAddCT and chained through 'next'.
 */
typedef struct _CTInfoRec {
    XlcCharSet charset;		/* the charset this ESC sequence selects */
    const char *ct_sequence;	/* Compound Text ESC sequence */
    unsigned int type;		/* _XlcParseCT result for ct_sequence */
    unsigned char final_byte;	/* final byte reported by _XlcParseCT */
				/* If type == XctExtSeg: */
				/* (otherwise NULL and 0, see _XlcAddCT) */
    const char *ext_segment;	/* extended segment name, then '\002' */
    int ext_segment_len;	/* length of above, including final '\002' */

    struct _CTInfoRec *next;	/* next record in ct_list, or NULL */
} CTInfoRec, *CTInfo;
403
/*
 * List of character sets that can be used for Compound Text.
 * Includes all that are listed in default_ct_data, but more can be added
 * at runtime through _XlcAddCT.
 * ct_list is the head of the singly linked list; ct_list_end is its last
 * record, kept so that _XlcAddCT can append without walking the list.
 * Both are NULL while the list is empty.
 */
static CTInfo ct_list = NULL;
static CTInfo ct_list_end = NULL;
411
412/*
413 * Returns a Compound Text info record for an ESC sequence.
414 * The first part of the ESC sequence has already been parsed into 'type'
415 * and 'final_byte'. The remainder starts at 'text', at least 'text_len'
416 * bytes (only used if type == XctExtSeg).
417 */
418static CTInfo
419_XlcGetCTInfo(
420    unsigned int type,
421    unsigned char final_byte,
422    const char *text,
423    int text_len)
424{
425    CTInfo ct_info;
426
427    for (ct_info = ct_list; ct_info; ct_info = ct_info->next)
428        if (ct_info->type == type
429            && ct_info->final_byte == final_byte
430            && (type != XctExtSeg
431                || (text_len >= ct_info->ext_segment_len
432                    && memcmp(text, ct_info->ext_segment,
433                              ct_info->ext_segment_len) == 0)))
434            return ct_info;
435
436    return (CTInfo) NULL;
437}
438
439/* Returns the Compound Text info for a given XlcCharSet.
440   Returns NULL if none is found. */
441static CTInfo
442_XlcGetCTInfoFromCharSet(
443    XlcCharSet charset)
444{
445    CTInfo ct_info;
446
447    for (ct_info = ct_list; ct_info; ct_info = ct_info->next)
448	if (ct_info->charset == charset)
449	    return ct_info;
450
451    return (CTInfo) NULL;
452}
453
454/* Creates a new XlcCharSet, given its name (including side suffix) and
455   Compound Text ESC sequence (normally at most 4 bytes), and makes it
456   eligible for Compound Text processing. */
457XlcCharSet
458_XlcAddCT(
459    const char *name,
460    const char *ct_sequence)
461{
462    CTInfo ct_info, existing_info;
463    XlcCharSet charset;
464    const char *ct_ptr;
465    int length;
466    unsigned int type;
467    unsigned char final_byte;
468
469    charset = _XlcGetCharSet(name);
470    if (charset != NULL) {
471        /* Even if the charset already exists, it is OK to register a second
472           Compound Text sequence for it. */
473    } else {
474        /* Attempt to create the charset. */
475        charset = _XlcCreateDefaultCharSet(name, ct_sequence);
476        if (charset == NULL)
477	    return (XlcCharSet) NULL;
478        _XlcAddCharSet(charset);
479    }
480
481    /* Allocate a CTinfo record. */
482    length = strlen(ct_sequence);
483    ct_info = (CTInfo) Xmalloc(sizeof(CTInfoRec) + length+1);
484    if (ct_info == NULL)
485	return charset;
486
487    ct_info->charset = charset;
488    ct_info->ct_sequence = strcpy((char *) (ct_info + 1), ct_sequence);
489
490    /* Parse the Compound Text sequence. */
491    ct_ptr = ct_sequence;
492    type = _XlcParseCT(&ct_ptr, &length, &final_byte);
493
494    ct_info->type = type;
495    ct_info->final_byte = final_byte;
496
497    switch (type) {
498	case XctGL94:
499	case XctGR94:
500	case XctGR96:
501	case XctGL94MB:
502	case XctGR94MB:
503	case XctOtherCoding:
504            ct_info->ext_segment = NULL;
505            ct_info->ext_segment_len = 0;
506            break;
507	case XctExtSeg: {
508            /* By convention, the extended segment name is the encoding_name
509               in lowercase. */
510            const char *q = charset->encoding_name;
511            int n = strlen(q);
512            char *p;
513
514            /* Ensure ct_info->ext_segment_len <= 0x3fff - 6. */
515            if (n > 0x3fff - 6 - 1) {
516                Xfree(ct_info);
517                return charset;
518            }
519            p = (char *) Xmalloc(n+1);
520            if (p == NULL) {
521                Xfree(ct_info);
522                return charset;
523            }
524            ct_info->ext_segment = p;
525            ct_info->ext_segment_len = n+1;
526            for ( ; n > 0; p++, q++, n--)
527                *p = (*q >= 'A' && *q <= 'Z' ? *q - 'A' + 'a' : *q);
528            *p = XctSTX;
529            break;
530        }
531	default:
532            Xfree(ct_info);
533            return (XlcCharSet) NULL;
534    }
535
536    /* Insert it into the list, if not already present. */
537    existing_info =
538        _XlcGetCTInfo(type, ct_info->final_byte,
539                      ct_info->ext_segment, ct_info->ext_segment_len);
540    if (existing_info == NULL) {
541        /* Insert it at the end. If there are duplicates CTinfo entries
542           for the same XlcCharSet, we want the first (standard) one to
543           override the second (user defined) one. */
544	ct_info->next = NULL;
545	if (ct_list_end)
546	    ct_list_end->next = ct_info;
547	else
548	    ct_list = ct_info;
549	ct_list_end = ct_info;
550    } else {
551        if (existing_info->charset != charset
552            /* We have a conflict, with one exception: JISX0208.1983-0 and
553               JISX0208.1990-0 are the same for all practical purposes. */
554            && !(strncmp(existing_info->charset->name, "JISX0208", 8) == 0
555                 && strncmp(charset->name, "JISX0208", 8) == 0)) {
556            fprintf(stderr,
557                    "Xlib: charsets %s and %s have the same CT sequence\n",
558                    charset->name, existing_info->charset->name);
559            if (strcmp(charset->ct_sequence, ct_sequence) == 0)
560                charset->ct_sequence = "";
561        }
562        Xfree(ct_info);
563    }
564
565    return charset;
566}
567
568
569/* ========== Converters String <--> CharSet <--> Compound Text ========== */
570
/*
 * Structure representing the parse state of a Compound Text string.
 * It is reset by init_state and updated by _XlcCheckCTSequence whenever a
 * charset designation is recognized.
 */
typedef struct _StateRec {
    XlcCharSet charset;		/* The charset of the current segment */
    XlcCharSet GL_charset;	/* The charset responsible for 0x00..0x7F */
    XlcCharSet GR_charset;	/* The charset responsible for 0x80..0xFF */
    /* Set when an "other coding" sequence is recognized, cleared again by
       the XctReturn sequence. */
    XlcCharSet Other_charset;	/* != NULL if currently in an other segment */
    /* Bytes of the current extended segment that are still to be copied;
       cttocs decrements it by the number of bytes it copies. */
    int ext_seg_left;		/* > 0 if currently in an extended segment */
} StateRec, *State;
581
582
583/* Subroutine for parsing an ESC sequence. */
584
/* Outcome of _XlcCheckCTSequence. */
typedef enum {
    resOK,		/* Charset saved in 'state', sequence skipped */
    resNotInList,	/* Charset not found, sequence skipped */
    resNotCTSeq		/* EscSeq not recognized, pointers not changed */
} CheckResult;
590
591static CheckResult
592_XlcCheckCTSequence(
593    State state,
594    const char **ctext,
595    int *ctext_len)
596{
597    XlcCharSet charset;
598    CTInfo ct_info;
599    const char *tmp_ctext = *ctext;
600    int tmp_ctext_len = *ctext_len;
601    unsigned int type;
602    unsigned char final_byte;
603    int ext_seg_left = 0;
604
605    /* Check for validity. */
606    type = _XlcParseCT(&tmp_ctext, &tmp_ctext_len, &final_byte);
607
608    switch (type) {
609	case XctGL94:
610	case XctGR94:
611	case XctGR96:
612	case XctGL94MB:
613	case XctGR94MB:
614	case XctOtherCoding:
615            *ctext = tmp_ctext;
616            *ctext_len = tmp_ctext_len;
617            break;
618        case XctReturn:
619            *ctext = tmp_ctext;
620            *ctext_len = tmp_ctext_len;
621            state->Other_charset = NULL;
622            return resOK;
623        case XctExtSeg:
624            if (tmp_ctext_len > 2
625                && (tmp_ctext[0] & 0x80) && (tmp_ctext[0] & 0x80)) {
626                unsigned int msb = tmp_ctext[0] & 0x7f;
627                unsigned int lsb = tmp_ctext[1] & 0x7f;
628                ext_seg_left = (msb << 7) + lsb;
629                if (ext_seg_left <= tmp_ctext_len - 2) {
630                    *ctext = tmp_ctext + 2;
631                    *ctext_len = tmp_ctext_len - 2;
632                    break;
633                }
634            }
635            return resNotCTSeq;
636        default:
637            return resNotCTSeq;
638    }
639
640    ct_info = _XlcGetCTInfo(type, final_byte, *ctext, ext_seg_left);
641
642    if (ct_info) {
643        charset = ct_info->charset;
644        state->ext_seg_left = ext_seg_left;
645        if (type == XctExtSeg) {
646            state->charset = charset;
647            /* Skip past the extended segment name and the separator. */
648            *ctext += ct_info->ext_segment_len;
649            *ctext_len -= ct_info->ext_segment_len;
650            state->ext_seg_left -= ct_info->ext_segment_len;
651        } else if (type == XctOtherCoding) {
652            state->Other_charset = charset;
653        } else {
654            if (charset->side == XlcGL) {
655                state->GL_charset = charset;
656            } else if (charset->side == XlcGR) {
657                state->GR_charset = charset;
658            } else {
659                state->GL_charset = charset;
660                state->GR_charset = charset;
661            }
662        }
663        return resOK;
664    } else {
665        state->ext_seg_left = 0;
666        if (type == XctExtSeg) {
667            /* Skip the entire extended segment. */
668            *ctext += ext_seg_left;
669            *ctext_len -= ext_seg_left;
670        }
671        return resNotInList;
672    }
673}
674
675static void
676init_state(
677    XlcConv conv)
678{
679    State state = (State) conv->state;
680    static XlcCharSet default_GL_charset = NULL;
681    static XlcCharSet default_GR_charset = NULL;
682
683    if (default_GL_charset == NULL) {
684	default_GL_charset = _XlcGetCharSet("ISO8859-1:GL");
685	default_GR_charset = _XlcGetCharSet("ISO8859-1:GR");
686    }
687
688    /* The initial state is ISO-8859-1 on both sides. */
689    state->GL_charset = state->charset = default_GL_charset;
690    state->GR_charset = default_GR_charset;
691
692    state->Other_charset = NULL;
693
694    state->ext_seg_left = 0;
695}
696
697/* from XlcNCompoundText to XlcNCharSet */
698
699static int
700cttocs(
701    XlcConv conv,
702    XPointer *from,
703    int *from_left,
704    XPointer *to,
705    int *to_left,
706    XPointer *args,
707    int num_args)
708{
709    State state = (State) conv->state;
710    XlcCharSet charset = NULL;
711    const char *ctptr;
712    char *bufptr;
713    int ctext_len, buf_len;
714    int unconv_num = 0;
715
716    ctptr = (const char *) *from;
717    bufptr = (char *) *to;
718    ctext_len = *from_left;
719    buf_len = *to_left;
720
721    while (ctext_len > 0 && buf_len > 0) {
722        if (state->ext_seg_left == 0) {
723            /* Not in the middle of an extended segment; look at next byte. */
724            unsigned char ch = *ctptr;
725            XlcCharSet ch_charset;
726
727            if (ch == XctESC) {
728                CheckResult ret =
729                    _XlcCheckCTSequence(state, &ctptr, &ctext_len);
730                if (ret == resOK)
731                    /* state has been modified. */
732                    continue;
733                if (ret == resNotInList) {
734                    /* XXX Just continue with previous charset. */
735                    unconv_num++;
736                    continue;
737                }
738            } else if (ch == XctCSI) {
739                /* XXX Simply ignore the XctLeftToRight, XctRightToLeft,
740                   XctDirectionEnd sequences for the moment. */
741                unsigned char dummy;
742                if (_XlcParseCT(&ctptr, &ctext_len, &dummy)) {
743                    unconv_num++;
744                    continue;
745                }
746            }
747
748            /* Find the charset which is responsible for this byte. */
749            ch_charset = (state->Other_charset != NULL ? state->Other_charset :
750                          (ch & 0x80 ? state->GR_charset : state->GL_charset));
751
752            /* Set the charset of this run, or continue the current run,
753               or stop the current run. */
754            if (charset) {
755                if (charset != ch_charset)
756                    break;
757            } else {
758                state->charset = charset = ch_charset;
759            }
760
761            /* We don't want to split a character into multiple pieces. */
762            if (buf_len < 6) {
763                if (charset->char_size > 0) {
764                    if (buf_len < charset->char_size)
765                        break;
766                } else {
767                    /* char_size == 0 is tricky. The code here is good only
768                       for valid UTF-8 input. */
769                    if (charset->ct_sequence[0] == XctESC
770                        && charset->ct_sequence[1] == XctOtherCoding
771                        && charset->ct_sequence[2] == 'G') {
772                        int char_size = (ch < 0xc0 ? 1 :
773                                         ch < 0xe0 ? 2 :
774                                         ch < 0xf0 ? 3 :
775                                         ch < 0xf8 ? 4 :
776                                         ch < 0xfc ? 5 :
777                                                     6);
778                        if (buf_len < char_size)
779                            break;
780                    }
781                }
782            }
783
784            *bufptr++ = *ctptr++;
785            ctext_len--;
786            buf_len--;
787        } else {
788            /* Copy as much as possible from the current extended segment
789               to the buffer. */
790            int char_size;
791
792            /* Set the charset of this run, or continue the current run,
793               or stop the current run. */
794            if (charset) {
795                if (charset != state->charset)
796                    break;
797            } else {
798                charset = state->charset;
799            }
800
801            char_size = charset->char_size;
802
803            if (state->ext_seg_left <= buf_len || char_size > 0) {
804                int n = (state->ext_seg_left <= buf_len
805                         ? state->ext_seg_left
806                         : (buf_len / char_size) * char_size);
807                memcpy(bufptr, ctptr, n);
808                ctptr += n; ctext_len -= n;
809                bufptr += n; buf_len -= n;
810                state->ext_seg_left -= n;
811            } else {
812#if UTF8_IN_EXTSEQ
813                /* char_size == 0 is tricky. The code here is good only
814                   for valid UTF-8 input. */
815                if (strcmp(charset->name, "ISO10646-1") == 0) {
816                    unsigned char ch = *ctptr;
817                    int char_size = (ch < 0xc0 ? 1 :
818                                     ch < 0xe0 ? 2 :
819                                     ch < 0xf0 ? 3 :
820                                     ch < 0xf8 ? 4 :
821                                     ch < 0xfc ? 5 :
822                                                 6);
823                    int i;
824                    if (buf_len < char_size)
825                        break;
826                    /* A small loop is faster than calling memcpy. */
827                    for (i = char_size; i > 0; i--)
828                        *bufptr++ = *ctptr++;
829                    ctext_len -= char_size;
830                    buf_len -= char_size;
831                    state->ext_seg_left -= char_size;
832                } else
833#endif
834                {
835                    /* Here ctext_len >= state->ext_seg_left > buf_len.
836                       We may be splitting a character into multiple pieces.
837                       Oh well. */
838                    int n = buf_len;
839                    memcpy(bufptr, ctptr, n);
840                    ctptr += n; ctext_len -= n;
841                    bufptr += n; buf_len -= n;
842                    state->ext_seg_left -= n;
843                }
844            }
845        }
846    }
847
848    /* 'charset' is the charset for the current run. In some cases,
849       'state->charset' contains the charset for the next run. Therefore,
850       return 'charset'.
851       'charset' may still be NULL only if no output was produced. */
852    if (num_args > 0)
853	*((XlcCharSet *) args[0]) = charset;
854
855    *from_left -= ctptr - *((const char **) from);
856    *from = (XPointer) ctptr;
857
858    *to_left -= bufptr - *((char **) to);
859    *to = (XPointer) bufptr;
860
861    return unconv_num;
862}
863
864/* from XlcNCharSet to XlcNCompoundText */
865
866static int
867cstoct(
868    XlcConv conv,
869    XPointer *from,
870    int *from_left,
871    XPointer *to,
872    int *to_left,
873    XPointer *args,
874    int num_args)
875{
876    State state = (State) conv->state;
877    XlcSide side;
878    unsigned char min_ch = 0, max_ch = 0;
879    int length, unconv_num;
880    CTInfo ct_info;
881    XlcCharSet charset;
882    const char *csptr;
883    char *ctptr;
884    int csstr_len, ct_len;
885    char *ext_segment_start;
886    int char_size;
887
888    /* One argument is required, of type XlcCharSet. */
889    if (num_args < 1)
890	return -1;
891
892    csptr = *((const char **) from);
893    ctptr = *((char **) to);
894    csstr_len = *from_left;
895    ct_len = *to_left;
896
897    charset = (XlcCharSet) args[0];
898
899    ct_info = _XlcGetCTInfoFromCharSet(charset);
900    if (ct_info == NULL)
901	return -1;
902
903    side = charset->side;
904    length = strlen(ct_info->ct_sequence);
905
906    ext_segment_start = NULL;
907
908    if (ct_info->type == XctOtherCoding) {
909        /* Output the Escape sequence for switching to the charset, and
910           reserve room now for the XctReturn sequence at the end. */
911        if (ct_len < length + 3)
912            return -1;
913
914        memcpy(ctptr, ct_info->ct_sequence, length);
915        ctptr += length;
916        ct_len -= length + 3;
917    } else
918    /* Test whether the charset is already active. */
919    if (((side == XlcGR || side == XlcGLGR)
920	 && charset != state->GR_charset)
921	|| ((side == XlcGL || side == XlcGLGR)
922	    && charset != state->GL_charset)) {
923
924        /* Output the Escape sequence for switching to the charset. */
925        if (ct_info->type == XctExtSeg) {
926            if (ct_len < length + 2 + ct_info->ext_segment_len)
927                return -1;
928
929            memcpy(ctptr, ct_info->ct_sequence, length);
930            ctptr += length;
931            ct_len -= length;
932
933            ctptr += 2;
934            ct_len -= 2;
935            ext_segment_start = ctptr;
936
937            /* The size of an extended segment must fit in 14 bits. */
938            if (ct_len > 0x3fff)
939                ct_len = 0x3fff;
940
941            memcpy(ctptr, ct_info->ext_segment, ct_info->ext_segment_len);
942            ctptr += ct_info->ext_segment_len;
943            ct_len -= ct_info->ext_segment_len;
944        } else {
945            if (ct_len < length)
946                return -1;
947
948            memcpy(ctptr, ct_info->ct_sequence, length);
949            ctptr += length;
950            ct_len -= length;
951        }
952    }
953
954    /* If the charset has side GL or GR, prepare remapping the characters
955       to the correct side. */
956    if (charset->set_size) {
957        min_ch = 0x20;
958        max_ch = 0x7f;
959        if (charset->set_size == 94) {
960            max_ch--;
961	    if (charset->char_size > 1 || side == XlcGR)
962		min_ch++;
963        }
964    }
965
966    /* Actually copy the contents. */
967    unconv_num = 0;
968    char_size = charset->char_size;
969    if (char_size == 1) {
970	while (csstr_len > 0 && ct_len > 0) {
971	    if (charset->set_size) {
972		/* The CompoundText specification says that the only
973		   control characters allowed are 0x09, 0x0a, 0x1b, 0x9b.
974		   Therefore here we eliminate other control characters. */
975		unsigned char ch = *((unsigned char *) csptr) & 0x7f;
976		if (!((ch >= min_ch && ch <= max_ch)
977		      || (side == XlcGL
978			  && (ch == 0x00 || ch == 0x09 || ch == 0x0a))
979		      || ((side == XlcGL || side == XlcGR)
980			  && (ch == 0x1b)))) {
981                    csptr++;
982                    csstr_len--;
983		    unconv_num++;
984                    continue;
985 		}
986	    }
987
988	    if (side == XlcGL)
989		*ctptr++ = *csptr++ & 0x7f;
990	    else if (side == XlcGR)
991		*ctptr++ = *csptr++ | 0x80;
992	    else
993		*ctptr++ = *csptr++;
994	    csstr_len--;
995	    ct_len--;
996	}
997    } else if (char_size > 1) {
998	while (csstr_len >= char_size && ct_len >= char_size) {
999	    if (side == XlcGL) {
1000		int i;
1001		for (i = char_size; i > 0; i--)
1002		    *ctptr++ = *csptr++ & 0x7f;
1003	    } else if (side == XlcGR) {
1004		int i;
1005		for (i = char_size; i > 0; i--)
1006		    *ctptr++ = *csptr++ | 0x80;
1007	    } else {
1008		int i;
1009		for (i = char_size; i > 0; i--)
1010		    *ctptr++ = *csptr++;
1011	    }
1012	    csstr_len -= char_size;
1013	    ct_len -= char_size;
1014	}
1015    } else {
1016        /* char_size = 0. The code here is good only for valid UTF-8 input. */
1017        if ((charset->ct_sequence[0] == XctESC
1018             && charset->ct_sequence[1] == XctOtherCoding
1019             && charset->ct_sequence[2] == 'G')
1020#if UTF8_IN_EXTSEQ
1021            || strcmp(charset->name, "ISO10646-1") == 0
1022#endif
1023           ) {
1024            while (csstr_len > 0 && ct_len > 0) {
1025                unsigned char ch = * (unsigned char *) csptr;
1026                int char_size = (ch < 0xc0 ? 1 :
1027                                 ch < 0xe0 ? 2 :
1028                                 ch < 0xf0 ? 3 :
1029                                 ch < 0xf8 ? 4 :
1030                                 ch < 0xfc ? 5 :
1031                                             6);
1032                int i;
1033                if (!(csstr_len >= char_size && ct_len >= char_size))
1034                    break;
1035                for (i = char_size; i > 0; i--)
1036                    *ctptr++ = *csptr++;
1037                csstr_len -= char_size;
1038                ct_len -= char_size;
1039            }
1040        } else {
1041            while (csstr_len > 0 && ct_len > 0) {
1042                *ctptr++ = *csptr++;
1043                csstr_len--;
1044                ct_len--;
1045            }
1046        }
1047    }
1048
1049    if (ct_info->type == XctOtherCoding) {
1050        /* Terminate with an XctReturn sequence. */
1051        ctptr[0] = XctESC;
1052        ctptr[1] = XctOtherCoding;
1053        ctptr[2] = '@';
1054        ctptr += 3;
1055    } else if (ext_segment_start != NULL) {
1056        /* Backpatch the extended segment's length. */
1057        int ext_segment_length = ctptr - ext_segment_start;
1058        *(ext_segment_start - 2) = (ext_segment_length >> 7) | 0x80;
1059        *(ext_segment_start - 1) = (ext_segment_length & 0x7f) | 0x80;
1060    } else {
1061        if (side == XlcGR || side == XlcGLGR)
1062            state->GR_charset = charset;
1063        if (side == XlcGL || side == XlcGLGR)
1064            state->GL_charset = charset;
1065    }
1066
1067    *from_left -= csptr - *((const char **) from);
1068    *from = (XPointer) csptr;
1069
1070    *to_left -= ctptr - *((char **) to);
1071    *to = (XPointer) ctptr;
1072
1073    return 0;
1074}
1075
1076/* from XlcNString to XlcNCharSet */
1077
1078static int
1079strtocs(
1080    XlcConv conv,
1081    XPointer *from,
1082    int *from_left,
1083    XPointer *to,
1084    int *to_left,
1085    XPointer *args,
1086    int num_args)
1087{
1088    State state = (State) conv->state;
1089    const char *src;
1090    char *dst;
1091    unsigned char side;
1092    int length;
1093
1094    src = (const char *) *from;
1095    dst = (char *) *to;
1096
1097    length = min(*from_left, *to_left);
1098    side = *((unsigned char *) src) & 0x80;
1099
1100    while (side == (*((unsigned char *) src) & 0x80) && length-- > 0)
1101	*dst++ = *src++;
1102
1103    *from_left -= src - (const char *) *from;
1104    *from = (XPointer) src;
1105    *to_left -= dst - (char *) *to;
1106    *to = (XPointer) dst;
1107
1108    if (num_args > 0)
1109	*((XlcCharSet *)args[0]) = (side ? state->GR_charset : state->GL_charset);
1110
1111    return 0;
1112}
1113
1114/* from XlcNCharSet to XlcNString */
1115
1116static int
1117cstostr(
1118    XlcConv conv,
1119    XPointer *from,
1120    int *from_left,
1121    XPointer *to,
1122    int *to_left,
1123    XPointer *args,
1124    int num_args)
1125{
1126    State state = (State) conv->state;
1127    const char *csptr;
1128    char *string_ptr;
1129    int csstr_len, str_len;
1130    unsigned char ch;
1131    int unconv_num = 0;
1132
1133    /* This converter can only convert from ISO8859-1:GL and ISO8859-1:GR. */
1134    if (num_args < 1
1135	|| !((XlcCharSet) args[0] == state->GL_charset
1136	     || (XlcCharSet) args[0] == state->GR_charset))
1137	return -1;
1138
1139    csptr = *((const char **) from);
1140    string_ptr = *((char **) to);
1141    csstr_len = *from_left;
1142    str_len = *to_left;
1143
1144    while (csstr_len > 0 && str_len > 0) {
1145	ch = *((unsigned char *) csptr++);
1146	csstr_len--;
1147	/* Citing ICCCM: "STRING as a type specifies the ISO Latin-1 character
1148	   set plus the control characters TAB and NEWLINE." */
1149	if ((ch < 0x20 && ch != 0x00 && ch != 0x09 && ch != 0x0a)
1150	    || (ch >= 0x7f && ch < 0xa0)) {
1151	    unconv_num++;
1152	    continue;
1153	}
1154	*((unsigned char *) string_ptr++) = ch;
1155	str_len--;
1156    }
1157
1158    *from_left -= csptr - *((const char **) from);
1159    *from = (XPointer) csptr;
1160
1161    *to_left -= string_ptr - *((char **) to);
1162    *to = (XPointer) string_ptr;
1163
1164    return unconv_num;
1165}
1166
1167
1168static XlcConv
1169create_conv(
1170    XlcConvMethods methods)
1171{
1172    XlcConv conv;
1173
1174    conv = (XlcConv) Xmalloc(sizeof(XlcConvRec) + sizeof(StateRec));
1175    if (conv == NULL)
1176	return (XlcConv) NULL;
1177
1178    conv->state = (XPointer) &conv[1];
1179
1180    conv->methods = methods;
1181
1182    init_state(conv);
1183
1184    return conv;
1185}
1186
1187static void
1188close_converter(
1189    XlcConv conv)
1190{
1191    /* conv->state is allocated together with conv, free both at once.  */
1192    Xfree((char *) conv);
1193}
1194
1195
1196static XlcConvMethodsRec cttocs_methods = {
1197    close_converter,
1198    cttocs,
1199    init_state
1200};
1201
1202static XlcConv
1203open_cttocs(
1204    XLCd from_lcd,
1205    const char *from_type,
1206    XLCd to_lcd,
1207    const char *to_type)
1208{
1209    return create_conv(&cttocs_methods);
1210}
1211
1212
1213static XlcConvMethodsRec cstoct_methods = {
1214    close_converter,
1215    cstoct,
1216    init_state
1217};
1218
1219static XlcConv
1220open_cstoct(
1221    XLCd from_lcd,
1222    const char *from_type,
1223    XLCd to_lcd,
1224    const char *to_type)
1225{
1226    return create_conv(&cstoct_methods);
1227}
1228
1229
1230static XlcConvMethodsRec strtocs_methods = {
1231    close_converter,
1232    strtocs,
1233    init_state
1234};
1235
1236static XlcConv
1237open_strtocs(
1238    XLCd from_lcd,
1239    const char *from_type,
1240    XLCd to_lcd,
1241    const char *to_type)
1242{
1243    return create_conv(&strtocs_methods);
1244}
1245
1246
1247static XlcConvMethodsRec cstostr_methods = {
1248    close_converter,
1249    cstostr,
1250    init_state
1251};
1252
1253static XlcConv
1254open_cstostr(
1255    XLCd from_lcd,
1256    const char *from_type,
1257    XLCd to_lcd,
1258    const char *to_type)
1259{
1260    return create_conv(&cstostr_methods);
1261}
1262
1263
1264/* =========================== Initialization =========================== */
1265
1266Bool
1267_XlcInitCTInfo(void)
1268{
1269    if (ct_list == NULL) {
1270        const CTDataRec *ct_data;
1271        int num;
1272        XlcCharSet charset;
1273
1274        /* Initialize ct_list.  */
1275
1276	num = sizeof(default_ct_data) / sizeof(CTDataRec);
1277	for (ct_data = default_ct_data; num > 0; ct_data++, num--) {
1278	    charset = _XlcAddCT(ct_data->name, ct_data->ct_sequence);
1279            if (charset == NULL)
1280                continue;
1281			if (strncmp(charset->ct_sequence, "\x1b\x25\x2f", 3) != 0)
1282				charset->source = CSsrcStd;
1283			else
1284				charset->source = CSsrcXLC;
1285	}
1286
1287        /* Register CompoundText and CharSet converters.  */
1288
1289        _XlcSetConverter((XLCd) NULL, XlcNCompoundText,
1290                         (XLCd) NULL, XlcNCharSet,
1291                         open_cttocs);
1292        _XlcSetConverter((XLCd) NULL, XlcNString,
1293                         (XLCd) NULL, XlcNCharSet,
1294                         open_strtocs);
1295
1296        _XlcSetConverter((XLCd) NULL, XlcNCharSet,
1297                         (XLCd) NULL, XlcNCompoundText,
1298                         open_cstoct);
1299        _XlcSetConverter((XLCd) NULL, XlcNCharSet,
1300                         (XLCd) NULL, XlcNString,
1301                         open_cstostr);
1302    }
1303
1304    return True;
1305}
1306