1/*
2Copyright (c) 2001 by Juliusz Chroboczek
3
4Permission is hereby granted, free of charge, to any person obtaining a copy
5of this software and associated documentation files (the "Software"), to deal
6in the Software without restriction, including without limitation the rights
7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8copies of the Software, and to permit persons to whom the Software is
9furnished to do so, subject to the following conditions:
10
11The above copyright notice and this permission notice shall be included in
12all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20THE SOFTWARE.
21*/
22
23#ifdef HAVE_CONFIG_H
24# include "config.h"
25#endif
26
27#include <stdlib.h>
28#include <stdio.h>
29#include <string.h>
30#include <ctype.h>
31
32#include "sys.h"
33#include "other.h"
34#include "charset.h"
35#include "parser.h"
36
37#ifndef NULL
38#define NULL 0
39#endif
40
/*
 * Recode callback that performs no translation: the code point "n" is
 * handed back exactly as received.  "self" is not consulted.  Used by the
 * Unknown*Charset placeholder records in this file.
 */
static unsigned int
IdentityRecode(unsigned int n, const CharsetRec * self GCC_UNUSED)
{
    return n;
}
46
#ifdef UNUSED
/*
 * Reverse-mapping counterpart of IdentityRecode(); only compiled when
 * UNUSED is defined.
 *
 * The value "n" is accepted unchanged when it fits the code space selected
 * by self->type:
 *   T_94, T_96      - n itself lies in 0x20..0x7F
 *   T_128           - n is below 0x80
 *   T_9494, T_9696  - both (n >> 8) and (n & 0xFF) lie in 0x20..0x7F
 *   T_94192         - both (n >> 8) and (n & 0x7F) lie in 0x20..0x7F
 * Any other type aborts the program.
 *
 * Returns n on success, -1 when n is outside the accepted range.
 */
static int
IdentityReverse(unsigned int n, const CharsetRec * self)
{
#define IN_GL(c) ((c) >= 0x20 && (c) < 0x80)
    int accepted = 0;

    switch (self->type) {
    case T_94:
    case T_96:
	accepted = IN_GL(n);
	break;
    case T_128:
	accepted = (n < 0x80);
	break;
    case T_9494:
    case T_9696:
	accepted = (IN_GL(n >> 8) && IN_GL(n & 0xFF));
	break;
    case T_94192:
	accepted = (IN_GL(n >> 8) && IN_GL(n & 0x7F));
	break;
    default:
	abort();
	/* NOTREACHED */
    }
#undef IN_GL

    return accepted ? (int) n : -1;
}
#endif
82
/*
 * Reverse callback that never finds a match: it ignores both arguments and
 * always returns -1.  Used by the Unknown*Charset placeholder records in
 * this file.
 */
static int
NullReverse(unsigned int n GCC_UNUSED, const CharsetRec * self GCC_UNUSED)
{
    return -1;
}
88
/*
 * Placeholder charset records, one per ISO 2022 set size, handed out by
 * getUnknownCharset() when no real charset can be found.  Each is
 * initialised with IdentityRecode and NullReverse as its callbacks and with
 * every remaining slot zero.
 * NOTE(review): the initialisers are positional; the CharsetRec field order
 * is declared outside this file - confirm against charset.h.
 */
static const CharsetRec Unknown94Charset =
{"Unknown (94)", T_94, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0};
static const CharsetRec Unknown96Charset =
{"Unknown (96)", T_96, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0};
static const CharsetRec Unknown9494Charset =
{"Unknown (94x94)", T_9494, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0};
static const CharsetRec Unknown9696Charset =
{"Unknown (96x96)", T_9696, 0, IdentityRecode, NullReverse, 0, 0, 0, 0, 0, 0};
97
/*
 * Description of one charset whose conversion tables are obtained through
 * FontEncMapFind()/FontMapReverse().  Entries live in fontencCharsets[];
 * getFontencCharset() fills in the last two members the first time an entry
 * is loaded successfully.
 */
typedef struct _FontencCharset {
    const char *name;		/* name matched (loosely, via compare()) on lookup by name */
    int type;			/* one of the T_* codes; set to T_FAILED if loading fails */
    unsigned char final;	/* matched together with type on lookup; 0 when unused
				   (presumably the ISO 2022 final byte - confirm) */
    const char *xlfd;		/* encoding name passed to FontEncMapFind() */
    unsigned shift;		/* added to a code before FontEncRecode(), and
				   subtracted again after a reverse lookup */
    FontMapPtr mapping;		/* forward mapping, 0 until loaded */
    FontMapReversePtr reverse;	/* reverse mapping, 0 until loaded */
} FontencCharsetRec, *FontencCharsetPtr;
107/* *INDENT-OFF* */
/*
 * Table of the charsets that are backed by font-encoding mappings.
 * Columns follow FontencCharsetRec: name, type, final, xlfd, shift,
 * mapping, reverse.  The last two start out as 0 and are filled in by
 * getFontencCharset(), which is also why the table is not const (it may
 * additionally overwrite "type" with T_FAILED).  The all-zero row
 * terminates the table.
 */
static FontencCharsetRec fontencCharsets[] =
{
    {"ISO 646 (1973)", T_94,    '@', "iso646.1973-0",    0x00,   0, 0},
    {"ASCII",          T_94,    'B', "iso8859-1",        0x00,   0, 0},
    {"JIS X 0201:GL",  T_94,    'J', "jisx0201.1976-0",  0x00,   0, 0},
    {"JIS X 0201:GR",  T_94,    'I', "jisx0201.1976-0",  0x80,   0, 0},
    {"DEC Special",    T_94,    '0', "dec-special",      0x00,   0, 0},
    {"DEC Technical",  T_94,    '>', "dec-dectech",      0x00,   0, 0},

    {"ISO 8859-1",     T_96,    'A', "iso8859-1",        0x80,   0, 0},
    {"ISO 8859-2",     T_96,    'B', "iso8859-2",        0x80,   0, 0},
    {"ISO 8859-3",     T_96,    'C', "iso8859-3",        0x80,   0, 0},
    {"ISO 8859-4",     T_96,    'D', "iso8859-4",        0x80,   0, 0},
    {"ISO 8859-5",     T_96,    'L', "iso8859-5",        0x80,   0, 0},
    {"ISO 8859-6",     T_96,    'G', "iso8859-6",        0x80,   0, 0},
    {"ISO 8859-7",     T_96,    'F', "iso8859-7",        0x80,   0, 0},
    {"ISO 8859-8",     T_96,    'H', "iso8859-8",        0x80,   0, 0},
    {"ISO 8859-9",     T_96,    'M', "iso8859-9",        0x80,   0, 0},
    {"ISO 8859-10",    T_96,    'V', "iso8859-10",       0x80,   0, 0},
    {"ISO 8859-11",    T_96,    'T', "iso8859-11",       0x80,   0, 0},
    {"TIS 620",        T_96,    'T', "iso8859-11",       0x80,   0, 0},
    {"ISO 8859-13",    T_96,    'Y', "iso8859-13",       0x80,   0, 0},
    {"ISO 8859-14",    T_96,    '_', "iso8859-14",       0x80,   0, 0},
    {"ISO 8859-15",    T_96,    'b', "iso8859-15",       0x80,   0, 0},
    {"ISO 8859-16",    T_96,    'f', "iso8859-16",       0x80,   0, 0},
    {"KOI8-E",         T_96,    '@', "koi8-e",           0x80,   0, 0},
    {"TCVN",           T_96,    'Z', "tcvn-0",           0x80,   0, 0},

    {"GB 2312",        T_9494,  'A', "gb2312.1980-0",    0x0000, 0, 0},
    {"JIS X 0208",     T_9494,  'B', "jisx0208.1990-0",  0x0000, 0, 0},
    {"KSC 5601",       T_9494,  'C', "ksc5601.1987-0",   0x0000, 0, 0},
    {"JIS X 0212",     T_9494,  'D', "jisx0212.1990-0",  0x0000, 0, 0},

    {"GB 2312",        T_9696,  'A', "gb2312.1980-0",    0x0000, 0, 0},
    {"JIS X 0208",     T_9696,  'B', "jisx0208.1990-0",  0x0000, 0, 0},
    {"KSC 5601",       T_9696,  'C', "ksc5601.1987-0",   0x0000, 0, 0},
    {"JIS X 0212",     T_9696,  'D', "jisx0212.1990-0",  0x0000, 0, 0},

    {"KOI8-R",         T_128,   0,   "koi8-r",           0x80,   0, 0},
    {"KOI8-U",         T_128,   0,   "koi8-u",           0x80,   0, 0},
    {"KOI8-RU",        T_128,   0,   "koi8-ru",          0x80,   0, 0},
    {"CP 1252",        T_128,   0,   "microsoft-cp1252", 0x80,   0, 0},
    {"CP 1251",        T_128,   0,   "microsoft-cp1251", 0x80,   0, 0},
    {"CP 1250",        T_128,   0,   "microsoft-cp1250", 0x80,   0, 0},

    {"CP 437",         T_128,   0,   "ibm-cp437",        0x80,   0, 0},
    {"CP 850",         T_128,   0,   "ibm-cp850",        0x80,   0, 0},
    {"CP 866",         T_128,   0,   "ibm-cp866",        0x80,   0, 0},

    {"Big 5",          T_94192, 0,   "big5.eten-0",      0x8000, 0, 0},
    {0,                0,       0,   0,                  0,      0, 0}
};
160/* *INDENT-ON* */
161
/*
 * Description of one charset that is handled by dedicated code instead of a
 * font-encoding mapping.  getOtherCharset() calls "init" on a freshly
 * allocated OtherState and copies the three remaining callbacks into the
 * CharsetRec it builds (other_recode, other_reverse, other_stack).
 */
typedef struct _OtherCharset {
    const char *name;		/* name matched (loosely, via compare()) on lookup */
    int (*init) (OtherStatePtr);	/* prepares the state; a zero result is treated as failure */
    unsigned int (*mapping) (unsigned int, OtherStatePtr);	/* stored as other_recode */
    unsigned int (*reverse) (unsigned int, OtherStatePtr);	/* stored as other_reverse */
    int (*stack) (unsigned, OtherStatePtr);	/* stored as other_stack */
} OtherCharsetRec, *OtherCharsetPtr;
169
/*
 * Table of the charsets served by the init_*, mapping_*, reverse_* and
 * stack_* routines (declared outside this file).  Columns follow
 * OtherCharsetRec; the all-zero row terminates the table.
 */
static const OtherCharsetRec otherCharsets[] =
{
    {"GBK", init_gbk, mapping_gbk, reverse_gbk, stack_gbk},
    {"UTF-8", init_utf8, mapping_utf8, reverse_utf8, stack_utf8},
    {"SJIS", init_sjis, mapping_sjis, reverse_sjis, stack_sjis},
    {"BIG5-HKSCS", init_hkscs, mapping_hkscs, reverse_hkscs, stack_hkscs},
    {"GB18030", init_gb18030, mapping_gb18030, reverse_gb18030, stack_gb18030},
    {0, 0, 0, 0, 0}
};
179
/*
 * Loose equality test for charset and locale names.
 *
 * White space, '-' and '_' are skipped wherever they occur in either string
 * and letters are compared without regard to case, so that for instance
 * "ISO 8859-1" and "iso8859_1" are considered equal.
 *
 * Returns 0 when the names match and 1 when they differ; the result is not
 * an ordering.
 */
static int
compare(const char *s, const char *t)
{
#define IS_FILLER(ch) (isspace(UChar(ch)) || (ch) == '-' || (ch) == '_')
    while (*s != '\0' || *t != '\0') {
	/* filler in the first string is dropped before anything else */
	if (*s != '\0' && IS_FILLER(*s)) {
	    ++s;
	    continue;
	}
	if (*t != '\0' && IS_FILLER(*t)) {
	    ++t;
	    continue;
	}
	/* both sides now sit on a significant character, or one has ended */
	if (*s == '\0' || *t == '\0')
	    return 1;
	if (tolower(UChar(*s)) != tolower(UChar(*t)))
	    return 1;
	++s;
	++t;
    }
#undef IS_FILLER
    return 0;
}
196
197static unsigned int
198FontencCharsetRecode(unsigned int n, const CharsetRec * self)
199{
200    const FontencCharsetRec *fc = (const FontencCharsetRec *) (self->data);
201
202    return FontEncRecode(n + fc->shift, fc->mapping);
203}
204
205static int
206FontencCharsetReverse(unsigned int i, const CharsetRec * self)
207{
208    const FontencCharsetRec *fc = (const FontencCharsetRec *) (self->data);
209    unsigned n;
210
211    n = fc->reverse->reverse(i, fc->reverse->data);
212    if (n == 0 || n < fc->shift)
213	return -1;
214    else
215	n -= fc->shift;
216
217#define IS_GL(n) ((n) >= 0x20 && (n) < 0x80)
218    switch (self->type) {
219    case T_94:
220    case T_96:
221	if (IS_GL(n))
222	    return (int) n;
223	else
224	    return -1;
225    case T_128:
226	if (n < 0x80)
227	    return (int) n;
228	else
229	    return -1;
230    case T_9494:
231    case T_9696:
232	if (IS_GL(n >> 8) && IS_GL(n & 0xFF))
233	    return (int) n;
234	else
235	    return -1;
236    case T_94192:
237	if (IS_GL(n >> 8) && IS_GL(n & 0x7F))
238	    return (int) n;
239	else
240	    return -1;
241    default:
242	abort();
243	/* NOTREACHED */
244    }
245#undef IS_GL
246}
247
248static CharsetPtr cachedCharsets = NULL;
249
250static CharsetPtr
251getCachedCharset(unsigned final, int type, const char *name)
252{
253    CharsetPtr c;
254    for (c = cachedCharsets; c; c = c->next) {
255	if (((c->type == type && c->final == final) ||
256	     (name && !compare(c->name, name))) &&
257	    (c->type != T_FAILED))
258	    return c;
259    }
260    return NULL;
261}
262
/*
 * Push "c" onto the front of the cachedCharsets list so that later calls to
 * getCachedCharset() can find it.  Overwrites c->next.
 */
static void
cacheCharset(CharsetPtr c)
{
    c->next = cachedCharsets;
    cachedCharsets = c;
}
269
270static CharsetPtr
271getFontencCharset(unsigned final, int type, const char *name)
272{
273    FontencCharsetPtr fc;
274    CharsetPtr c;
275    FontMapPtr mapping;
276    FontMapReversePtr reverse;
277
278    fc = fontencCharsets;
279    while (fc->name) {
280	if (((fc->type == type && fc->final == final) ||
281	     (name && !compare(fc->name, name))) &&
282	    (fc->type != T_FAILED))
283	    break;
284	fc++;
285    }
286
287    if (!fc->name)
288	return NULL;
289
290    c = malloc(sizeof(CharsetRec));
291    if (c == NULL)
292	return NULL;
293
294    mapping = FontEncMapFind(fc->xlfd, FONT_ENCODING_UNICODE, -1, -1, NULL);
295    if (!mapping) {
296	free(c);
297	fc->type = T_FAILED;
298	return NULL;
299    }
300
301    reverse = FontMapReverse(mapping);
302    if (!reverse) {
303	free(c);
304	fc->type = T_FAILED;
305	return NULL;
306    }
307
308    fc->mapping = mapping;
309    fc->reverse = reverse;
310
311    c->name = fc->name;
312    c->type = fc->type;
313    c->final = fc->final;
314    c->recode = FontencCharsetRecode;
315    c->reverse = FontencCharsetReverse;
316    c->data = fc;
317
318    cacheCharset(c);
319    return c;
320}
321
322static CharsetPtr
323getOtherCharset(const char *name)
324{
325    const OtherCharsetRec *fc;
326    CharsetPtr c;
327    OtherStatePtr s;
328
329    fc = otherCharsets;
330    while (fc->name) {
331	if (name && !compare(fc->name, name))
332	    break;
333	fc++;
334    }
335
336    if (!fc->name)
337	return NULL;
338
339    c = malloc(sizeof(CharsetRec));
340    if (c == NULL)
341	return NULL;
342
343    s = malloc(sizeof(OtherState));
344    if (s == NULL) {
345	free(c);
346	return NULL;
347    }
348
349    c->name = fc->name;
350    c->type = T_OTHER;
351    c->final = 0;
352    c->data = fc;
353    c->other_recode = fc->mapping;
354    c->other_reverse = fc->reverse;
355    c->other_stack = fc->stack;
356    c->other_aux = s;
357
358    if (!fc->init(s)) {
359	c->type = T_FAILED;
360	return NULL;
361    }
362
363    cacheCharset(c);
364    return c;
365}
366
367const CharsetRec *
368getUnknownCharset(int type)
369{
370    switch (type) {
371    case T_94:
372	return &Unknown94Charset;
373    case T_96:
374	return &Unknown96Charset;
375    case T_9494:
376	return &Unknown9494Charset;
377    case T_9696:
378	return &Unknown9696Charset;
379    default:
380	return &Unknown94Charset;
381    }
382}
383
384const CharsetRec *
385getCharset(unsigned final, int type)
386{
387    const CharsetRec *c;
388
389    c = getCachedCharset(final, type, NULL);
390    if (c)
391	return c;
392
393    c = getFontencCharset(final, type, NULL);
394    if (c)
395	return c;
396
397    return getUnknownCharset(type);
398}
399
400const CharsetRec *
401getCharsetByName(const char *name)
402{
403    const CharsetRec *c;
404
405    if (name == NULL)
406	return getUnknownCharset(T_94);
407
408    c = getCachedCharset(0, 0, name);
409    if (c)
410	return c;
411
412    c = getFontencCharset(0, 0, name);
413    if (c)
414	return c;
415
416    c = getOtherCharset(name);
417    if (c)
418	return c;
419
420    return getUnknownCharset(T_94);
421}
422/* *INDENT-OFF* */
423static const LocaleCharsetRec localeCharsets[] =
424{
425    {"C",          0, 2, "ASCII", NULL,         "ISO 8859-1",    NULL,         NULL},
426    {"POSIX",      0, 2, "ASCII", NULL,         "ISO 8859-1",    NULL,         NULL},
427    {"ISO8859-1",  0, 2, "ASCII", NULL,         "ISO 8859-1",    NULL,         NULL},
428    {"ISO8859-2",  0, 2, "ASCII", NULL,         "ISO 8859-2",    NULL,         NULL},
429    {"ISO8859-3",  0, 2, "ASCII", NULL,         "ISO 8859-3",    NULL,         NULL},
430    {"ISO8859-4",  0, 2, "ASCII", NULL,         "ISO 8859-4",    NULL,         NULL},
431    {"ISO8859-5",  0, 2, "ASCII", NULL,         "ISO 8859-5",    NULL,         NULL},
432    {"ISO8859-6",  0, 2, "ASCII", NULL,         "ISO 8859-6",    NULL,         NULL},
433    {"ISO8859-7",  0, 2, "ASCII", NULL,         "ISO 8859-7",    NULL,         NULL},
434    {"ISO8859-8",  0, 2, "ASCII", NULL,         "ISO 8859-8",    NULL,         NULL},
435    {"ISO8859-9",  0, 2, "ASCII", NULL,         "ISO 8859-9",    NULL,         NULL},
436    {"ISO8859-10", 0, 2, "ASCII", NULL,         "ISO 8859-10",   NULL,         NULL},
437    {"ISO8859-11", 0, 2, "ASCII", NULL,         "ISO 8859-11",   NULL,         NULL},
438    {"TIS620",     0, 2, "ASCII", NULL,         "ISO 8859-11",   NULL,         NULL},
439    {"ISO8859-13", 0, 2, "ASCII", NULL,         "ISO 8859-13",   NULL,         NULL},
440    {"ISO8859-14", 0, 2, "ASCII", NULL,         "ISO 8859-14",   NULL,         NULL},
441    {"ISO8859-15", 0, 2, "ASCII", NULL,         "ISO 8859-15",   NULL,         NULL},
442    {"ISO8859-16", 0, 2, "ASCII", NULL,         "ISO 8859-16",   NULL,         NULL},
443    {"KOI8-R",     0, 2, "ASCII", NULL,         "KOI8-R",        NULL,         NULL},
444    {"CP1251",     0, 2, "ASCII", NULL,         "CP 1251",       NULL,         NULL},
445    {"TCVN",       0, 2, "ASCII", NULL,         "TCVN",          NULL,         NULL},
446    {"eucCN",      0, 1, "ASCII", "GB 2312",    NULL,            NULL,         NULL},
447    {"GB2312",     0, 1, "ASCII", "GB 2312",    NULL,            NULL,         NULL},
448    {"eucJP",      0, 1, "ASCII", "JIS X 0208", "JIS X 0201:GR", "JIS X 0212", NULL},
449    {"eucKR",      0, 1, "ASCII", "KSC 5601",   NULL,            NULL,         NULL},
450    {"eucCN",      0, 1, "ASCII", "GB 2312",    NULL,            NULL,         NULL},
451    {"Big5",       0, 1, "ASCII", "Big 5",      NULL,            NULL,         NULL},
452    {"gbk",        0, 1, NULL,    NULL,         NULL,            NULL,         "GBK"},
453    {"UTF-8",      0, 1, NULL,    NULL,         NULL,            NULL,         "UTF-8"},
454    {"SJIS",       0, 1, NULL,    NULL,         NULL,            NULL,         "SJIS"},
455    {"Big5-HKSCS", 0, 1, NULL,    NULL,         NULL,            NULL,         "BIG5-HKSCS"},
456    {"gb18030",    0, 1, NULL,    NULL,         NULL,            NULL,         "GB18030"},
457    {0,            0, 0, 0,       0,            0,               0,            0}
458};
459/* *INDENT-ON* */
460
461void
462reportCharsets(void)
463{
464    const LocaleCharsetRec *p;
465    FontencCharsetPtr q;
466    printf("Known locale encodings:\n\n");
467    for (p = localeCharsets; p->name; p++) {
468	if (p->other) {
469	    printf("  %s (non-ISO-2022 encoding)\n", p->other);
470	    continue;
471	}
472	printf("  %s: GL -> G%d, GR -> G%d", p->name, p->gl, p->gr);
473	if (p->g0)
474	    printf(", G0: %s", p->g0);
475	if (p->g1)
476	    printf(", G1: %s", p->g1);
477	if (p->g2)
478	    printf(", G2: %s", p->g2);
479	if (p->g3)
480	    printf(", G3: %s", p->g3);
481	printf("\n");
482    }
483
484    printf("\n\nKnown charsets (not all may be available):\n\n");
485    for (q = fontencCharsets; q->name; q++)
486	printf("  %s%s\n",
487	       q->name, q->final ? " (ISO 2022)" : "");
488}
489
490int
491getLocaleState(const char *locale,
492	       const char *charset,
493	       int *gl_return, int *gr_return,
494	       const CharsetRec * *g0_return,
495	       const CharsetRec * *g1_return,
496	       const CharsetRec * *g2_return,
497	       const CharsetRec * *g3_return,
498	       const CharsetRec * *other_return)
499{
500    int result = 0;
501    char *resolved = 0;
502    const LocaleCharsetRec *p;
503
504    if (!charset) {
505	resolved = resolveLocale(locale);
506	if (!resolved)
507	    return -1;
508	charset = strrchr(resolved, '.');
509	if (charset)
510	    charset++;
511	else
512	    charset = resolved;
513    }
514
515    for (p = localeCharsets; p->name; p++) {
516	if (compare(p->name, charset) == 0)
517	    break;
518    }
519
520    if (p->name == NULL) {
521	result = -1;
522    } else {
523
524	*gl_return = p->gl;
525	*gr_return = p->gr;
526	*g0_return = getCharsetByName(p->g0);
527	*g1_return = getCharsetByName(p->g1);
528	*g2_return = getCharsetByName(p->g2);
529	*g3_return = getCharsetByName(p->g3);
530	if (p->other)
531	    *other_return = getCharsetByName(p->other);
532	else
533	    *other_return = NULL;
534    }
535    if (resolved != 0)
536	free(resolved);
537    return result;
538}
539
540#ifdef NO_LEAKS
541static int
542isUnknownCharsetPtr(CharsetPtr p)
543{
544    return (p == &Unknown94Charset
545	    || p == &Unknown96Charset
546	    || p == &Unknown9494Charset
547	    || p == &Unknown9696Charset);
548}
549
550static void
551destroyFontencCharsetPtr(FontencCharsetPtr p)
552{
553    p->mapping = 0;
554
555    /*
556     * This should, but does not work -
557     *     FontMapReverseFree(p->reverse)
558     *
559     * The iteration for map[] is based on reading the source of
560     * FontMapReverse().
561     */
562    if (p->reverse) {
563	int n;
564	unsigned **map = p->reverse->data;
565	for (n = 0; n < 256; ++n) {
566	    if (map[n])
567		free(map[n]);
568	}
569	free(p->reverse->data);
570	free(p->reverse);
571	p->reverse = 0;
572    }
573}
574
575static void
576destroyCharset(CharsetPtr p)
577{
578    if (!isUnknownCharsetPtr(p)) {
579	destroyFontencCharsetPtr(p->data);
580	free(p);
581    }
582}
583
584void
585charset_leaks(void)
586{
587    while (cachedCharsets != 0) {
588	CharsetPtr next = cachedCharsets->next;
589	destroyCharset(cachedCharsets);
590	cachedCharsets = next;
591    }
592}
593#endif
594