charset.c revision a8fdb4bc
1/*
2Copyright (c) 2001 by Juliusz Chroboczek
3
4Permission is hereby granted, free of charge, to any person obtaining a copy
5of this software and associated documentation files (the "Software"), to deal
6in the Software without restriction, including without limitation the rights
7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8copies of the Software, and to permit persons to whom the Software is
9furnished to do so, subject to the following conditions:
10
11The above copyright notice and this permission notice shall be included in
12all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20THE SOFTWARE.
21*/
22/* $XFree86: xc/programs/luit/charset.c,v 1.8 2003/12/22 17:48:12 tsi Exp $ */
23
24#include <stdlib.h>
25#include <stdio.h>
26#include <string.h>
27#include <ctype.h>
28#include <X11/fonts/fontenc.h>
29#include "other.h"
30#include "charset.h"
31#include "parser.h"
32
33#ifndef NULL
34#define NULL 0
35#endif
36
37static unsigned int
38IdentityRecode(unsigned int n, CharsetPtr self)
39{
40    return n;
41}
42
#ifdef UNUSED
/*
 * Reverse counterpart of IdentityRecode (only compiled when UNUSED is
 * defined).  Returns n when it is a legal code for the charset's type
 * and -1 otherwise; aborts on a type it does not know about.
 */
static int
IdentityReverse(unsigned int n, CharsetPtr self)
{
#define IS_GL(n) ((n) >= 0x20 && (n) < 0x80)
    int acceptable;

    switch(self->type) {
    case T_94:
    case T_96:
        /* single byte, must lie in 0x20..0x7F */
        acceptable = IS_GL(n);
        break;
    case T_128:
        acceptable = (n < 0x80);
        break;
    case T_9494:
    case T_9696:
        /* two bytes, each in 0x20..0x7F */
        acceptable = IS_GL(n>>8) && IS_GL(n&0xFF);
        break;
    case T_94192:
        /* only the low seven bits of the second byte are range-checked */
        acceptable = IS_GL(n>>8) && IS_GL(n&0x7F);
        break;
    default:
        abort();
    }

    return acceptable ? (int) n : -1;
#undef IS_GL
}
#endif
71
72static int
73NullReverse(unsigned int n, CharsetPtr self)
74{
75    return -1;
76}
77
78CharsetRec Unknown94Charset =
79{ "Unknown (94)", T_94, 0, IdentityRecode, NullReverse, NULL, NULL};
80CharsetRec Unknown96Charset =
81{ "Unknown (96)", T_96, 0, IdentityRecode, NullReverse, NULL, NULL};
82CharsetRec Unknown9494Charset =
83{ "Unknown (94x94)", T_9494, 0, IdentityRecode, NullReverse, NULL, NULL};
84CharsetRec Unknown9696Charset =
85{ "Unknown (96x96)", T_9696, 0, IdentityRecode, NullReverse, NULL, NULL};
86
87typedef struct _FontencCharset {
88    char *name;
89    int type;
90    unsigned char final;
91    char *xlfd;
92    int shift;
93    FontMapPtr mapping;
94    FontMapReversePtr reverse;
95} FontencCharsetRec, *FontencCharsetPtr;
96
97FontencCharsetRec fontencCharsets[] = {
98    {"ISO 646 (1973)", T_94, '@', "iso646.1973-0", 0x00, NULL, NULL},
99    {"ASCII", T_94, 'B', "iso8859-1", 0x00, NULL, NULL},
100    {"JIS X 0201:GL", T_94, 'J', "jisx0201.1976-0", 0x00, NULL, NULL},
101    {"JIS X 0201:GR", T_94, 'I', "jisx0201.1976-0", 0x80, NULL, NULL},
102    {"DEC Special", T_94, '0', "dec-special", 0x00, NULL, NULL},
103    {"DEC Technical", T_94, '>', "dec-dectech", 0x00, NULL, NULL},
104
105    {"ISO 8859-1", T_96, 'A', "iso8859-1", 0x80, NULL, NULL},
106    {"ISO 8859-2", T_96, 'B', "iso8859-2", 0x80, NULL, NULL},
107    {"ISO 8859-3", T_96, 'C', "iso8859-3", 0x80, NULL, NULL},
108    {"ISO 8859-4", T_96, 'D', "iso8859-4", 0x80, NULL, NULL},
109    {"ISO 8859-5", T_96, 'L', "iso8859-5", 0x80, NULL, NULL},
110    {"ISO 8859-6", T_96, 'G', "iso8859-6", 0x80, NULL, NULL},
111    {"ISO 8859-7", T_96, 'F', "iso8859-7", 0x80, NULL, NULL},
112    {"ISO 8859-8", T_96, 'H', "iso8859-8", 0x80, NULL, NULL},
113    {"ISO 8859-9", T_96, 'M', "iso8859-9", 0x80, NULL, NULL},
114    {"ISO 8859-10", T_96, 'V', "iso8859-10", 0x80, NULL, NULL},
115    {"ISO 8859-11", T_96, 'T', "iso8859-11", 0x80, NULL, NULL},
116    {"TIS 620", T_96, 'T', "iso8859-11", 0x80, NULL, NULL},
117    {"ISO 8859-13", T_96, 'Y', "iso8859-13", 0x80, NULL, NULL},
118    {"ISO 8859-14", T_96, '_', "iso8859-14", 0x80, NULL, NULL},
119    {"ISO 8859-15", T_96, 'b', "iso8859-15", 0x80, NULL, NULL},
120    {"ISO 8859-16", T_96, 'f', "iso8859-16", 0x80, NULL, NULL},
121    {"KOI8-E", T_96, '@', "koi8-e", 0x80, NULL, NULL},
122    {"TCVN", T_96, 'Z', "tcvn-0", 0x80, NULL, NULL},
123
124    {"GB 2312", T_9494, 'A', "gb2312.1980-0", 0x0000, NULL, NULL},
125    {"JIS X 0208", T_9494, 'B', "jisx0208.1990-0", 0x0000, NULL, NULL},
126    {"KSC 5601", T_9494, 'C', "ksc5601.1987-0", 0x0000, NULL, NULL},
127    {"JIS X 0212", T_9494, 'D', "jisx0212.1990-0", 0x0000, NULL, NULL},
128
129    {"GB 2312", T_9696, 'A', "gb2312.1980-0", 0x0000, NULL, NULL},
130    {"JIS X 0208", T_9696, 'B', "jisx0208.1990-0", 0x0000, NULL, NULL},
131    {"KSC 5601", T_9696, 'C', "ksc5601.1987-0", 0x0000, NULL, NULL},
132    {"JIS X 0212", T_9696, 'D', "jisx0212.1990-0", 0x0000, NULL, NULL},
133
134    {"KOI8-R", T_128, 0, "koi8-r", 0x80, NULL, NULL},
135    {"KOI8-U", T_128, 0, "koi8-u", 0x80, NULL, NULL},
136    {"KOI8-RU", T_128, 0, "koi8-ru", 0x80, NULL, NULL},
137    {"CP 1252", T_128, 0, "microsoft-cp1252", 0x80, NULL, NULL},
138    {"CP 1251", T_128, 0, "microsoft-cp1251", 0x80, NULL, NULL},
139    {"CP 1250", T_128, 0, "microsoft-cp1250", 0x80, NULL, NULL},
140
141    {"CP 437", T_128, 0, "ibm-cp437", 0x80, NULL, NULL},
142    {"CP 850", T_128, 0, "ibm-cp850", 0x80, NULL, NULL},
143    {"CP 866", T_128, 0, "ibm-cp866", 0x80, NULL, NULL},
144
145    {"Big 5", T_94192, 0, "big5.eten-0", 0x8000, NULL, NULL},
146    {NULL, 0, 0, NULL, 0, NULL, NULL}
147};
148
149typedef struct _OtherCharset {
150    char *name;
151    int (*init)(OtherStatePtr);
152    unsigned int (*mapping)(unsigned int, OtherStatePtr);
153    unsigned int (*reverse)(unsigned int, OtherStatePtr);
154    int (*stack)(unsigned char, OtherStatePtr);
155} OtherCharsetRec, *OtherCharsetPtr;
156
157OtherCharsetRec otherCharsets[] = {
158    {"GBK", init_gbk, mapping_gbk, reverse_gbk, stack_gbk},
159    {"UTF-8", init_utf8, mapping_utf8, reverse_utf8, stack_utf8},
160    {"SJIS", init_sjis, mapping_sjis, reverse_sjis, stack_sjis},
161    {"BIG5-HKSCS", init_hkscs, mapping_hkscs, reverse_hkscs, stack_hkscs},
162    {"GB18030", init_gb18030, mapping_gb18030, reverse_gb18030, stack_gb18030},
163    {NULL, NULL, NULL, NULL, NULL}
164};
165
/*
 * Loose charset-name comparison.
 *
 * Letters are compared case-insensitively, and whitespace, '-' and '_'
 * are skipped wherever they occur in either string, so "ISO 8859-1",
 * "iso8859_1" and "ISO8859-1" all compare equal.
 *
 * Returns 0 when the names match and 1 when they differ (this is not an
 * ordering, unlike strcmp).
 */
static int
compare(const char *s, const char *t)
{
    while(*s || *t) {
        /*
         * The <ctype.h> functions are only defined for EOF and values
         * representable as unsigned char; a plain char with the high
         * bit set may be negative, so convert before classifying.
         */
        const unsigned char a = (unsigned char) *s;
        const unsigned char b = (unsigned char) *t;

        if(a && (isspace(a) || a == '-' || a == '_'))
            s++;
        else if(b && (isspace(b) || b == '-' || b == '_'))
            t++;
        else if(a && b && tolower(a) == tolower(b)) {
            s++;
            t++;
        } else
            return 1;
    }
    return 0;
}
182
183static unsigned int
184FontencCharsetRecode(unsigned int n, CharsetPtr self)
185{
186    FontencCharsetPtr fc = (FontencCharsetPtr)(self->data);
187
188    return FontEncRecode(n + fc->shift, fc->mapping);
189}
190
191static int
192FontencCharsetReverse(unsigned int i, CharsetPtr self)
193{
194    FontencCharsetPtr fc = (FontencCharsetPtr)(self->data);
195    int n;
196
197    n = fc->reverse->reverse(i, fc->reverse->data);
198    if(n == 0 || n < fc->shift)
199        return -1;
200    else
201        n -= fc->shift;
202
203#define IS_GL(n) ((n) >= 0x20 && (n) < 0x80)
204    switch(self->type) {
205    case T_94: case T_96:
206        if (IS_GL(n)) return n; else return -1;
207        break;
208    case T_128:
209        if (n < 0x80) return n; else return -1;
210    case T_9494: case T_9696:
211        if(IS_GL(n>>8) && IS_GL(n&0xFF))
212            return n;
213        else
214            return -1;
215        break;
216    case T_94192:
217        if(IS_GL(n>>8) && IS_GL(n&0x7F))
218            return n;
219        else
220            return -1;
221        break;
222    default:
223        abort();
224    }
225#undef IS_GL
226}
227
228
229static CharsetPtr cachedCharsets = NULL;
230
231static CharsetPtr
232getCachedCharset(unsigned char final, int type, const char *name)
233{
234    CharsetPtr c;
235    for(c = cachedCharsets; c; c = c->next) {
236        if(((c->type == type && c->final == final) ||
237            (name && !compare(c->name, name))) &&
238           (c->type != T_FAILED))
239            return c;
240    }
241    return NULL;
242}
243
244static void
245cacheCharset(CharsetPtr c) {
246    c->next = cachedCharsets;
247    cachedCharsets = c;
248}
249
250static CharsetPtr
251getFontencCharset(unsigned char final, int type, const char *name)
252{
253    FontencCharsetPtr fc;
254    CharsetPtr c;
255    FontMapPtr mapping;
256    FontMapReversePtr reverse;
257
258    fc = fontencCharsets;
259    while(fc->name) {
260        if(((fc->type == type && fc->final == final) ||
261            (name && !compare(fc->name, name))) &&
262           (fc->type != T_FAILED))
263            break;
264        fc++;
265    }
266
267    if(!fc->name)
268        return NULL;
269
270    c = malloc(sizeof(CharsetRec));
271    if(c == NULL)
272        return NULL;
273
274    mapping = FontEncMapFind(fc->xlfd, FONT_ENCODING_UNICODE, -1, -1, NULL);
275    if(!mapping) {
276        fc->type = T_FAILED;
277        return NULL;
278    }
279
280    reverse = FontMapReverse(mapping);
281    if(!reverse) {
282        fc->type = T_FAILED;
283        return NULL;
284    }
285
286    fc->mapping = mapping;
287    fc->reverse = reverse;
288
289    c->name = fc->name;
290    c->type = fc->type;
291    c->final = fc->final;
292    c->recode = FontencCharsetRecode;
293    c->reverse = FontencCharsetReverse;
294    c->data = fc;
295
296    cacheCharset(c);
297    return c;
298}
299
300static CharsetPtr
301getOtherCharset(const char *name)
302{
303    OtherCharsetPtr fc;
304    CharsetPtr c;
305    OtherStatePtr s;
306
307    fc = otherCharsets;
308    while(fc->name) {
309        if(name && !compare(fc->name, name))
310            break;
311        fc++;
312    }
313
314    if(!fc->name)
315        return NULL;
316
317    c = malloc(sizeof(CharsetRec));
318    if(c == NULL)
319        return NULL;
320
321    s = malloc(sizeof(OtherState));
322    if(s == NULL) {
323        free(c);
324        return NULL;
325    }
326
327    c->name = fc->name;
328    c->type = T_OTHER;
329    c->final = 0;
330    c->data = fc;
331    c->other_recode = fc->mapping;
332    c->other_reverse = fc->reverse;
333    c->other_stack = fc->stack;
334    c->other_aux = s;
335
336    if(!fc->init(s)) {
337        c->type = T_FAILED;
338        return NULL;
339    }
340
341    cacheCharset(c);
342    return c;
343}
344
345CharsetPtr
346getUnknownCharset(int type)
347{
348    switch(type) {
349    case T_94: return &Unknown94Charset;
350    case T_96: return &Unknown96Charset;
351    case T_9494: return &Unknown9494Charset;
352    case T_9696: return &Unknown9696Charset;
353    default: return &Unknown94Charset;
354    }
355}
356
357CharsetPtr
358getCharset(unsigned char final, int type)
359{
360    CharsetPtr c;
361
362    c = getCachedCharset(final, type, NULL);
363    if(c)
364        return c;
365
366    c = getFontencCharset(final, type, NULL);
367    if(c)
368        return c;
369
370    return getUnknownCharset(type);
371}
372
373CharsetPtr
374getCharsetByName(const char *name)
375{
376    CharsetPtr c;
377
378    if(name == NULL)
379        return getUnknownCharset(T_94);
380
381    c = getCachedCharset(0, 0, name);
382    if(c)
383        return c;
384
385    c = getFontencCharset(0, 0, name);
386    if(c)
387        return c;
388
389    c = getOtherCharset(name);
390    if(c)
391        return c;
392
393    return getUnknownCharset(T_94);
394}
395
396const LocaleCharsetRec localeCharsets[] = {
397    { "C", 0, 2, "ASCII", NULL, "ISO 8859-1", NULL, NULL},
398    { "POSIX", 0, 2, "ASCII", NULL, "ISO 8859-1", NULL, NULL},
399    { "ISO8859-1", 0, 2, "ASCII", NULL, "ISO 8859-1", NULL, NULL},
400    { "ISO8859-2", 0, 2, "ASCII", NULL, "ISO 8859-2", NULL, NULL},
401    { "ISO8859-3", 0, 2, "ASCII", NULL, "ISO 8859-3", NULL, NULL},
402    { "ISO8859-4", 0, 2, "ASCII", NULL, "ISO 8859-4", NULL, NULL},
403    { "ISO8859-5", 0, 2, "ASCII", NULL, "ISO 8859-5", NULL, NULL},
404    { "ISO8859-6", 0, 2, "ASCII", NULL, "ISO 8859-6", NULL, NULL},
405    { "ISO8859-7", 0, 2, "ASCII", NULL, "ISO 8859-7", NULL, NULL},
406    { "ISO8859-8", 0, 2, "ASCII", NULL, "ISO 8859-8", NULL, NULL},
407    { "ISO8859-9", 0, 2, "ASCII", NULL, "ISO 8859-9", NULL, NULL},
408    { "ISO8859-10", 0, 2, "ASCII", NULL, "ISO 8859-10", NULL, NULL},
409    { "ISO8859-11", 0, 2, "ASCII", NULL, "ISO 8859-11", NULL, NULL},
410    { "TIS620", 0, 2, "ASCII", NULL, "ISO 8859-11", NULL, NULL},
411    { "ISO8859-13", 0, 2, "ASCII", NULL, "ISO 8859-13", NULL, NULL},
412    { "ISO8859-14", 0, 2, "ASCII", NULL, "ISO 8859-14", NULL, NULL},
413    { "ISO8859-15", 0, 2, "ASCII", NULL, "ISO 8859-15", NULL, NULL},
414    { "ISO8859-16", 0, 2, "ASCII", NULL, "ISO 8859-16", NULL, NULL},
415    { "KOI8-R", 0, 2, "ASCII", NULL, "KOI8-R", NULL, NULL},
416    { "CP1251", 0, 2, "ASCII", NULL, "CP 1251", NULL, NULL},
417    { "TCVN", 0, 2, "ASCII", NULL, "TCVN", NULL, NULL},
418    { "eucCN", 0, 1, "ASCII", "GB 2312", NULL, NULL, NULL},
419    { "GB2312", 0, 1, "ASCII", "GB 2312", NULL, NULL, NULL},
420    { "eucJP", 0, 1, "ASCII", "JIS X 0208", "JIS X 0201:GR", "JIS X 0212", NULL},
421    { "eucKR", 0, 1, "ASCII", "KSC 5601", NULL, NULL, NULL},
422    { "eucCN", 0, 1, "ASCII", "GB 2312", NULL, NULL, NULL},
423    { "Big5", 0, 1, "ASCII", "Big 5", NULL, NULL, NULL},
424    { "gbk", 0, 1, NULL, NULL, NULL, NULL, "GBK"},
425    { "UTF-8", 0, 1, NULL, NULL, NULL, NULL, "UTF-8"},
426    { "SJIS", 0, 1, NULL, NULL, NULL, NULL, "SJIS"},
427    { "Big5-HKSCS", 0, 1, NULL, NULL, NULL, NULL, "BIG5-HKSCS"},
428    { "gb18030", 0, 1, NULL, NULL, NULL, NULL, "GB18030"},
429    { NULL, 0, 0, NULL, NULL, NULL, NULL, NULL}
430};
431
432void
433reportCharsets(void)
434{
435    const LocaleCharsetRec *p;
436    FontencCharsetPtr q;
437    printf("Known locale encodings:\n\n");
438    for(p = localeCharsets; p->name; p++) {
439        if(p->other) {
440            printf("  %s (non-ISO-2022 encoding)\n", p->other);
441	    continue;
442        }
443        printf("  %s: GL -> G%d, GR -> G%d", p->name, p->gl, p->gr);
444        if(p->g0) printf(", G0: %s", p->g0);
445        if(p->g1) printf(", G1: %s", p->g1);
446        if(p->g2) printf(", G2: %s", p->g2);
447        if(p->g3) printf(", G3: %s", p->g3);
448        printf("\n");
449    }
450
451    printf("\n\nKnown charsets (not all may be available):\n\n");
452    for(q = fontencCharsets; q->name; q++)
453        printf("  %s%s\n",
454               q->name, q->final?" (ISO 2022)":"");
455}
456
457int
458getLocaleState(const char *locale, char *charset,
459               int *gl_return, int *gr_return,
460               CharsetPtr *g0_return, CharsetPtr *g1_return,
461               CharsetPtr *g2_return, CharsetPtr *g3_return,
462               CharsetPtr *other_return)
463{
464    char *resolved = NULL;
465    const LocaleCharsetRec *p;
466
467    if(!charset) {
468        resolved = resolveLocale(locale);
469        if(!resolved)
470            return -1;
471        charset = strrchr(resolved, '.');
472        if(charset)
473            charset++;
474        else
475            charset = resolved;
476    }
477
478    for(p = localeCharsets; p->name; p++) {
479        if(compare(p->name, charset) == 0)
480            break;
481    }
482
483    if(p->name == NULL) {
484	if (resolved != 0)
485	    free(resolved);
486        return -1;
487    }
488
489    *gl_return = p->gl;
490    *gr_return = p->gr;
491    *g0_return = getCharsetByName(p->g0);
492    *g1_return = getCharsetByName(p->g1);
493    *g2_return = getCharsetByName(p->g2);
494    *g3_return = getCharsetByName(p->g3);
495    if(p->other)
496        *other_return = getCharsetByName(p->other);
497    else
498        *other_return = NULL;
499    return 0;
500}
501
502