charset.c revision 77683534
/*
Copyright (c) 2001 by Juliusz Chroboczek

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
22
23#ifdef HAVE_CONFIG_H
24# include "config.h"
25#endif
26
27#include <stdlib.h>
28#include <stdio.h>
29#include <string.h>
30#include <ctype.h>
31
32#include "sys.h"
33#include "other.h"
34#include "charset.h"
35#include "parser.h"
36
37#ifndef NULL
38#define NULL 0
39#endif
40
41static unsigned int
42IdentityRecode(unsigned int n, const CharsetRec * self GCC_UNUSED)
43{
44    return n;
45}
46
47#ifdef UNUSED
48static int
49IdentityReverse(unsigned int n, const CharsetRec * self)
50{
51#define IS_GL(n) ((n) >= 0x20 && (n) < 0x80)
52    switch (self->type) {
53    case T_94:
54    case T_96:
55	if (IS_GL(n))
56	    return n;
57	else
58	    return -1;
59    case T_128:
60	if (n < 0x80)
61	    return n;
62	else
63	    return -1;
64    case T_9494:
65    case T_9696:
66	if (IS_GL(n >> 8) && IS_GL(n & 0xFF))
67	    return n;
68	else
69	    return -1;
70    case T_94192:
71	if (IS_GL(n >> 8) && IS_GL(n & 0x7F))
72	    return n;
73	else
74	    return -1;
75    default:
76	abort();
77	/* NOTREACHED */
78    }
79#undef IS_GL
80}
81#endif
82
83static int
84NullReverse(unsigned int n GCC_UNUSED, const CharsetRec * self GCC_UNUSED)
85{
86    return -1;
87}
88
/*
 * Placeholder charsets returned by getUnknownCharset(), one per charset
 * class (94, 96, 94x94, 96x96).  Each recodes through IdentityRecode (the
 * code is passed through unchanged) and reverses through NullReverse
 * (always -1); every remaining field is zero.
 */
89static const CharsetRec Unknown94Charset =
90{"Unknown (94)", T_94, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0};
91static const CharsetRec Unknown96Charset =
92{"Unknown (96)", T_96, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0};
93static const CharsetRec Unknown9494Charset =
94{"Unknown (94x94)", T_9494, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0};
95static const CharsetRec Unknown9696Charset =
96{"Unknown (96x96)", T_9696, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0};
97
/*
 * Description of one charset whose mapping is loaded through the fontenc
 * functions (FontEncMapFind / FontMapReverse / FontEncRecode).
 */
98typedef struct _FontencCharset {
99    const char *name;		/* charset name; matched loosely with compare() */
100    int type;			/* T_* charset class; set to T_FAILED when loading fails */
101    unsigned char final;	/* matched together with type in charset lookups */
102    const char *xlfd;		/* encoding name handed to FontEncMapFind() */
103    unsigned shift;		/* added before FontEncRecode(), subtracted after reversing */
104    FontMapPtr mapping;		/* stored by getFontencCharset() after a successful load */
105    FontMapReversePtr reverse;	/* result of FontMapReverse(mapping), stored with mapping */
106} FontencCharsetRec, *FontencCharsetPtr;
107/* *INDENT-OFF* */
/*
 * Table of fontenc-backed charsets, searched in order by
 * getFontencCharset(): an entry matches on its (type, final) pair or on a
 * loose name comparison, so when two entries could match, the earlier one
 * wins.  The columns are name, type, final, xlfd, shift, mapping, reverse.
 *
 * The table is deliberately not const: getFontencCharset() stores the
 * loaded mapping/reverse pointers in the matching entry, and overwrites
 * its type with T_FAILED when the encoding cannot be loaded.
 *
 * The entry whose name is 0 terminates the table.
 */
108static FontencCharsetRec fontencCharsets[] =
109{
110    {"ISO 646 (1973)", T_94,    '@', "iso646.1973-0",    0x00,   0, 0},
111    {"ASCII",          T_94,    'B', "iso8859-1",        0x00,   0, 0},
112    {"JIS X 0201:GL",  T_94,    'J', "jisx0201.1976-0",  0x00,   0, 0},
113    {"JIS X 0201:GR",  T_94,    'I', "jisx0201.1976-0",  0x80,   0, 0},
114    {"DEC Special",    T_94,    '0', "dec-special",      0x00,   0, 0},
115    {"DEC Technical",  T_94,    '>', "dec-dectech",      0x00,   0, 0},
116
117    {"ISO 8859-1",     T_96,    'A', "iso8859-1",        0x80,   0, 0},
118    {"ISO 8859-2",     T_96,    'B', "iso8859-2",        0x80,   0, 0},
119    {"ISO 8859-3",     T_96,    'C', "iso8859-3",        0x80,   0, 0},
120    {"ISO 8859-4",     T_96,    'D', "iso8859-4",        0x80,   0, 0},
121    {"ISO 8859-5",     T_96,    'L', "iso8859-5",        0x80,   0, 0},
122    {"ISO 8859-6",     T_96,    'G', "iso8859-6",        0x80,   0, 0},
123    {"ISO 8859-7",     T_96,    'F', "iso8859-7",        0x80,   0, 0},
124    {"ISO 8859-8",     T_96,    'H', "iso8859-8",        0x80,   0, 0},
125    {"ISO 8859-9",     T_96,    'M', "iso8859-9",        0x80,   0, 0},
126    {"ISO 8859-10",    T_96,    'V', "iso8859-10",       0x80,   0, 0},
127    {"ISO 8859-11",    T_96,    'T', "iso8859-11",       0x80,   0, 0},
128    {"TIS 620",        T_96,    'T', "iso8859-11",       0x80,   0, 0},
129    {"ISO 8859-13",    T_96,    'Y', "iso8859-13",       0x80,   0, 0},
130    {"ISO 8859-14",    T_96,    '_', "iso8859-14",       0x80,   0, 0},
131    {"ISO 8859-15",    T_96,    'b', "iso8859-15",       0x80,   0, 0},
132    {"ISO 8859-16",    T_96,    'f', "iso8859-16",       0x80,   0, 0},
133    {"KOI8-E",         T_96,    '@', "koi8-e",           0x80,   0, 0},
134    {"TCVN",           T_96,    'Z', "tcvn-0",           0x80,   0, 0},
135
136    {"GB 2312",        T_9494,  'A', "gb2312.1980-0",    0x0000, 0, 0},
137    {"JIS X 0208",     T_9494,  'B', "jisx0208.1990-0",  0x0000, 0, 0},
138    {"KSC 5601",       T_9494,  'C', "ksc5601.1987-0",   0x0000, 0, 0},
139    {"JIS X 0212",     T_9494,  'D', "jisx0212.1990-0",  0x0000, 0, 0},
140
141    {"GB 2312",        T_9696,  'A', "gb2312.1980-0",    0x0000, 0, 0},
142    {"JIS X 0208",     T_9696,  'B', "jisx0208.1990-0",  0x0000, 0, 0},
143    {"KSC 5601",       T_9696,  'C', "ksc5601.1987-0",   0x0000, 0, 0},
144    {"JIS X 0212",     T_9696,  'D', "jisx0212.1990-0",  0x0000, 0, 0},
145
146    {"KOI8-R",         T_128,   0,   "koi8-r",           0x80,   0, 0},
147    {"KOI8-U",         T_128,   0,   "koi8-u",           0x80,   0, 0},
148    {"KOI8-RU",        T_128,   0,   "koi8-ru",          0x80,   0, 0},
149    {"CP 1252",        T_128,   0,   "microsoft-cp1252", 0x80,   0, 0},
150    {"CP 1251",        T_128,   0,   "microsoft-cp1251", 0x80,   0, 0},
151    {"CP 1250",        T_128,   0,   "microsoft-cp1250", 0x80,   0, 0},
152
153    {"CP 437",         T_128,   0,   "ibm-cp437",        0x80,   0, 0},
154    {"CP 850",         T_128,   0,   "ibm-cp850",        0x80,   0, 0},
155    {"CP 866",         T_128,   0,   "ibm-cp866",        0x80,   0, 0},
156
157    {"Big 5",          T_94192, 0,   "big5.eten-0",      0x8000, 0, 0},
158    {0,                0,       0,   0,                  0,      0, 0}
159};
160/* *INDENT-ON* */
161
/*
 * Description of one charset handled by dedicated callbacks rather than a
 * fontenc mapping.  getOtherCharset() looks entries up by name, runs init
 * on a freshly allocated OtherState, and copies the remaining callbacks
 * into the CharsetRec it builds.
 */
162typedef struct _OtherCharset {
163    const char *name;		/* charset name; matched loosely with compare() */
164    int (*init) (OtherStatePtr);	/* a zero return is treated as failure */
165    unsigned int (*mapping) (unsigned int, OtherStatePtr);	/* becomes other_recode */
166    unsigned int (*reverse) (unsigned int, OtherStatePtr);	/* becomes other_reverse */
167    int (*stack) (unsigned, OtherStatePtr);	/* becomes other_stack */
168} OtherCharsetRec, *OtherCharsetPtr;
169
/*
 * Charsets implemented by callback sets (init, mapping, reverse, stack)
 * defined outside this file.  Searched by name in getOtherCharset(); the
 * all-zero entry terminates the table.
 */
170static const OtherCharsetRec otherCharsets[] =
171{
172    {"GBK", init_gbk, mapping_gbk, reverse_gbk, stack_gbk},
173    {"UTF-8", init_utf8, mapping_utf8, reverse_utf8, stack_utf8},
174    {"SJIS", init_sjis, mapping_sjis, reverse_sjis, stack_sjis},
175    {"BIG5-HKSCS", init_hkscs, mapping_hkscs, reverse_hkscs, stack_hkscs},
176    {"GB18030", init_gb18030, mapping_gb18030, reverse_gb18030, stack_gb18030},
177    {0, 0, 0, 0, 0}
178};
179
/*
 * Loose comparison of two charset names.
 *
 * Letter case is ignored, and whitespace, '-' and '_' are skipped wherever
 * they occur in either string, so "ISO 8859-1" and "iso8859_1" compare
 * equal.
 *
 * Returns 0 when the names are equivalent and 1 when they differ.
 */
static int
compare(const char *s, const char *t)
{
#define IS_FILLER(ch) (isspace(UChar(ch)) || (ch) == '-' || (ch) == '_')
    for (;;) {
        if (*s == '\0' && *t == '\0')
            return 0;           /* both strings used up: equivalent */

        if (*s != '\0' && IS_FILLER(*s)) {
            s++;
            continue;
        }
        if (*t != '\0' && IS_FILLER(*t)) {
            t++;
            continue;
        }

        /* one string ended early, or the significant characters differ */
        if (*s == '\0' || *t == '\0')
            return 1;
        if (tolower(UChar(*s)) != tolower(UChar(*t)))
            return 1;

        s++;
        t++;
    }
#undef IS_FILLER
}
196
197static unsigned int
198FontencCharsetRecode(unsigned int n, const CharsetRec * self)
199{
200    const FontencCharsetRec *fc = (const FontencCharsetRec *) (self->data);
201
202    return FontEncRecode(n + fc->shift, fc->mapping);
203}
204
205static int
206FontencCharsetReverse(unsigned int i, const CharsetRec * self)
207{
208    const FontencCharsetRec *fc = (const FontencCharsetRec *) (self->data);
209    unsigned n;
210
211    n = fc->reverse->reverse(i, fc->reverse->data);
212    if (n == 0 || n < fc->shift)
213	return -1;
214    else
215	n -= fc->shift;
216
217#define IS_GL(n) ((n) >= 0x20 && (n) < 0x80)
218    switch (self->type) {
219    case T_94:
220    case T_96:
221	if (IS_GL(n))
222	    return (int) n;
223	else
224	    return -1;
225    case T_128:
226	if (n < 0x80)
227	    return (int) n;
228	else
229	    return -1;
230    case T_9494:
231    case T_9696:
232	if (IS_GL(n >> 8) && IS_GL(n & 0xFF))
233	    return (int) n;
234	else
235	    return -1;
236    case T_94192:
237	if (IS_GL(n >> 8) && IS_GL(n & 0x7F))
238	    return (int) n;
239	else
240	    return -1;
241    default:
242	abort();
243	/* NOTREACHED */
244    }
245#undef IS_GL
246}
247
248static CharsetPtr cachedCharsets = NULL;
249
250static CharsetPtr
251getCachedCharset(unsigned final, int type, const char *name)
252{
253    CharsetPtr c;
254    for (c = cachedCharsets; c; c = c->next) {
255	if (((c->type == type && c->final == final) ||
256	     (name && !compare(c->name, name))) &&
257	    (c->type != T_FAILED))
258	    return c;
259    }
260    return NULL;
261}
262
263static void
264cacheCharset(CharsetPtr c)
265{
266    c->next = cachedCharsets;
267    cachedCharsets = c;
268}
269
270static CharsetPtr
271getFontencCharset(unsigned final, int type, const char *name)
272{
273    FontencCharsetPtr fc;
274    CharsetPtr c;
275    FontMapPtr mapping;
276    FontMapReversePtr reverse;
277
278    fc = fontencCharsets;
279    while (fc->name) {
280	if (((fc->type == type && fc->final == final) ||
281	     (name && !compare(fc->name, name))) &&
282	    (fc->type != T_FAILED))
283	    break;
284	fc++;
285    }
286
287    if (!fc->name)
288	return NULL;
289
290    c = malloc(sizeof(CharsetRec));
291    if (c == NULL)
292	return NULL;
293
294    mapping = FontEncMapFind(fc->xlfd, FONT_ENCODING_UNICODE, -1, -1, NULL);
295    if (!mapping) {
296	fc->type = T_FAILED;
297	return NULL;
298    }
299
300    reverse = FontMapReverse(mapping);
301    if (!reverse) {
302	fc->type = T_FAILED;
303	return NULL;
304    }
305
306    fc->mapping = mapping;
307    fc->reverse = reverse;
308
309    c->name = fc->name;
310    c->type = fc->type;
311    c->final = fc->final;
312    c->recode = FontencCharsetRecode;
313    c->reverse = FontencCharsetReverse;
314    c->data = fc;
315
316    cacheCharset(c);
317    return c;
318}
319
320static CharsetPtr
321getOtherCharset(const char *name)
322{
323    const OtherCharsetRec *fc;
324    CharsetPtr c;
325    OtherStatePtr s;
326
327    fc = otherCharsets;
328    while (fc->name) {
329	if (name && !compare(fc->name, name))
330	    break;
331	fc++;
332    }
333
334    if (!fc->name)
335	return NULL;
336
337    c = malloc(sizeof(CharsetRec));
338    if (c == NULL)
339	return NULL;
340
341    s = malloc(sizeof(OtherState));
342    if (s == NULL) {
343	free(c);
344	return NULL;
345    }
346
347    c->name = fc->name;
348    c->type = T_OTHER;
349    c->final = 0;
350    c->data = fc;
351    c->other_recode = fc->mapping;
352    c->other_reverse = fc->reverse;
353    c->other_stack = fc->stack;
354    c->other_aux = s;
355
356    if (!fc->init(s)) {
357	c->type = T_FAILED;
358	return NULL;
359    }
360
361    cacheCharset(c);
362    return c;
363}
364
365const CharsetRec *
366getUnknownCharset(int type)
367{
368    switch (type) {
369    case T_94:
370	return &Unknown94Charset;
371    case T_96:
372	return &Unknown96Charset;
373    case T_9494:
374	return &Unknown9494Charset;
375    case T_9696:
376	return &Unknown9696Charset;
377    default:
378	return &Unknown94Charset;
379    }
380}
381
382const CharsetRec *
383getCharset(unsigned final, int type)
384{
385    const CharsetRec *c;
386
387    c = getCachedCharset(final, type, NULL);
388    if (c)
389	return c;
390
391    c = getFontencCharset(final, type, NULL);
392    if (c)
393	return c;
394
395    return getUnknownCharset(type);
396}
397
398const CharsetRec *
399getCharsetByName(const char *name)
400{
401    const CharsetRec *c;
402
403    if (name == NULL)
404	return getUnknownCharset(T_94);
405
406    c = getCachedCharset(0, 0, name);
407    if (c)
408	return c;
409
410    c = getFontencCharset(0, 0, name);
411    if (c)
412	return c;
413
414    c = getOtherCharset(name);
415    if (c)
416	return c;
417
418    return getUnknownCharset(T_94);
419}
420/* *INDENT-OFF* */
421static const LocaleCharsetRec localeCharsets[] =
422{
423    {"C",          0, 2, "ASCII", NULL,         "ISO 8859-1",    NULL,         NULL},
424    {"POSIX",      0, 2, "ASCII", NULL,         "ISO 8859-1",    NULL,         NULL},
425    {"ISO8859-1",  0, 2, "ASCII", NULL,         "ISO 8859-1",    NULL,         NULL},
426    {"ISO8859-2",  0, 2, "ASCII", NULL,         "ISO 8859-2",    NULL,         NULL},
427    {"ISO8859-3",  0, 2, "ASCII", NULL,         "ISO 8859-3",    NULL,         NULL},
428    {"ISO8859-4",  0, 2, "ASCII", NULL,         "ISO 8859-4",    NULL,         NULL},
429    {"ISO8859-5",  0, 2, "ASCII", NULL,         "ISO 8859-5",    NULL,         NULL},
430    {"ISO8859-6",  0, 2, "ASCII", NULL,         "ISO 8859-6",    NULL,         NULL},
431    {"ISO8859-7",  0, 2, "ASCII", NULL,         "ISO 8859-7",    NULL,         NULL},
432    {"ISO8859-8",  0, 2, "ASCII", NULL,         "ISO 8859-8",    NULL,         NULL},
433    {"ISO8859-9",  0, 2, "ASCII", NULL,         "ISO 8859-9",    NULL,         NULL},
434    {"ISO8859-10", 0, 2, "ASCII", NULL,         "ISO 8859-10",   NULL,         NULL},
435    {"ISO8859-11", 0, 2, "ASCII", NULL,         "ISO 8859-11",   NULL,         NULL},
436    {"TIS620",     0, 2, "ASCII", NULL,         "ISO 8859-11",   NULL,         NULL},
437    {"ISO8859-13", 0, 2, "ASCII", NULL,         "ISO 8859-13",   NULL,         NULL},
438    {"ISO8859-14", 0, 2, "ASCII", NULL,         "ISO 8859-14",   NULL,         NULL},
439    {"ISO8859-15", 0, 2, "ASCII", NULL,         "ISO 8859-15",   NULL,         NULL},
440    {"ISO8859-16", 0, 2, "ASCII", NULL,         "ISO 8859-16",   NULL,         NULL},
441    {"KOI8-R",     0, 2, "ASCII", NULL,         "KOI8-R",        NULL,         NULL},
442    {"CP1251",     0, 2, "ASCII", NULL,         "CP 1251",       NULL,         NULL},
443    {"TCVN",       0, 2, "ASCII", NULL,         "TCVN",          NULL,         NULL},
444    {"eucCN",      0, 1, "ASCII", "GB 2312",    NULL,            NULL,         NULL},
445    {"GB2312",     0, 1, "ASCII", "GB 2312",    NULL,            NULL,         NULL},
446    {"eucJP",      0, 1, "ASCII", "JIS X 0208", "JIS X 0201:GR", "JIS X 0212", NULL},
447    {"eucKR",      0, 1, "ASCII", "KSC 5601",   NULL,            NULL,         NULL},
448    {"eucCN",      0, 1, "ASCII", "GB 2312",    NULL,            NULL,         NULL},
449    {"Big5",       0, 1, "ASCII", "Big 5",      NULL,            NULL,         NULL},
450    {"gbk",        0, 1, NULL,    NULL,         NULL,            NULL,         "GBK"},
451    {"UTF-8",      0, 1, NULL,    NULL,         NULL,            NULL,         "UTF-8"},
452    {"SJIS",       0, 1, NULL,    NULL,         NULL,            NULL,         "SJIS"},
453    {"Big5-HKSCS", 0, 1, NULL,    NULL,         NULL,            NULL,         "BIG5-HKSCS"},
454    {"gb18030",    0, 1, NULL,    NULL,         NULL,            NULL,         "GB18030"},
455    {0,            0, 0, 0,       0,            0,               0,            0}
456};
457/* *INDENT-ON* */
458
459void
460reportCharsets(void)
461{
462    const LocaleCharsetRec *p;
463    FontencCharsetPtr q;
464    printf("Known locale encodings:\n\n");
465    for (p = localeCharsets; p->name; p++) {
466	if (p->other) {
467	    printf("  %s (non-ISO-2022 encoding)\n", p->other);
468	    continue;
469	}
470	printf("  %s: GL -> G%d, GR -> G%d", p->name, p->gl, p->gr);
471	if (p->g0)
472	    printf(", G0: %s", p->g0);
473	if (p->g1)
474	    printf(", G1: %s", p->g1);
475	if (p->g2)
476	    printf(", G2: %s", p->g2);
477	if (p->g3)
478	    printf(", G3: %s", p->g3);
479	printf("\n");
480    }
481
482    printf("\n\nKnown charsets (not all may be available):\n\n");
483    for (q = fontencCharsets; q->name; q++)
484	printf("  %s%s\n",
485	       q->name, q->final ? " (ISO 2022)" : "");
486}
487
488int
489getLocaleState(const char *locale,
490	       const char *charset,
491	       int *gl_return, int *gr_return,
492	       const CharsetRec * *g0_return,
493	       const CharsetRec * *g1_return,
494	       const CharsetRec * *g2_return,
495	       const CharsetRec * *g3_return,
496	       const CharsetRec * *other_return)
497{
498    int result = 0;
499    char *resolved = 0;
500    const LocaleCharsetRec *p;
501
502    if (!charset) {
503	resolved = resolveLocale(locale);
504	if (!resolved)
505	    return -1;
506	charset = strrchr(resolved, '.');
507	if (charset)
508	    charset++;
509	else
510	    charset = resolved;
511    }
512
513    for (p = localeCharsets; p->name; p++) {
514	if (compare(p->name, charset) == 0)
515	    break;
516    }
517
518    if (p->name == NULL) {
519	result = -1;
520    } else {
521
522	*gl_return = p->gl;
523	*gr_return = p->gr;
524	*g0_return = getCharsetByName(p->g0);
525	*g1_return = getCharsetByName(p->g1);
526	*g2_return = getCharsetByName(p->g2);
527	*g3_return = getCharsetByName(p->g3);
528	if (p->other)
529	    *other_return = getCharsetByName(p->other);
530	else
531	    *other_return = NULL;
532    }
533    if (resolved != 0)
534	free(resolved);
535    return result;
536}
537
538#ifdef NO_LEAKS
539static int
540isUnknownCharsetPtr(CharsetPtr p)
541{
542    return (p == &Unknown94Charset
543	    || p == &Unknown96Charset
544	    || p == &Unknown9494Charset
545	    || p == &Unknown9696Charset);
546}
547
548static void
549destroyFontencCharsetPtr(FontencCharsetPtr p)
550{
551    p->mapping = 0;
552
553    /*
554     * This should, but does not work -
555     *     FontMapReverseFree(p->reverse)
556     *
557     * The iteration for map[] is based on reading the source of
558     * FontMapReverse().
559     */
560    if (p->reverse) {
561	int n;
562	unsigned **map = p->reverse->data;
563	for (n = 0; n < 256; ++n) {
564	    if (map[n])
565		free(map[n]);
566	}
567	free(p->reverse->data);
568	free(p->reverse);
569	p->reverse = 0;
570    }
571}
572
573static void
574destroyCharset(CharsetPtr p)
575{
576    if (!isUnknownCharsetPtr(p)) {
577	destroyFontencCharsetPtr(p->data);
578	free(p);
579    }
580}
581
582void
583charset_leaks(void)
584{
585    while (cachedCharsets != 0) {
586	CharsetPtr next = cachedCharsets->next;
587	destroyCharset(cachedCharsets);
588	cachedCharsets = next;
589    }
590}
591#endif
592