citrus_mskanji.c revision 1.14.22.3 1 /* $NetBSD: citrus_mskanji.c,v 1.14.22.3 2017/07/31 04:23:35 perseant Exp $ */
2
3 /*-
4 * Copyright (c)2002 Citrus Project,
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 /*
30 * ja_JP.SJIS locale table for BSD4.4/rune
31 * version 1.0
32 * (C) Sin'ichiro MIYATANI / Phase One, Inc
33 * May 12, 1995
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 3. All advertising materials mentioning features or use of this software
44 * must display the following acknowledgement:
45 * This product includes software developed by Phase One, Inc.
46 * 4. The name of Phase One, Inc. may be used to endorse or promote products
47 * derived from this software without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 */
61
62
63 #include <sys/cdefs.h>
64 #if defined(LIBC_SCCS) && !defined(lint)
65 __RCSID("$NetBSD: citrus_mskanji.c,v 1.14.22.3 2017/07/31 04:23:35 perseant Exp $");
66 #endif /* LIBC_SCCS and not lint */
67
68 #include <assert.h>
69 #include <errno.h>
70 #include <string.h>
71 #include <stdio.h>
72 #include <stdlib.h>
73 #include <stddef.h>
74 #include <wchar.h>
75 #include <sys/types.h>
76 #include <limits.h>
77
78 #include "citrus_namespace.h"
79 #include "citrus_types.h"
80 #include "citrus_bcs.h"
81 #include "citrus_module.h"
82 #include "citrus_ctype.h"
83 #include "citrus_stdenc.h"
84 #include "citrus_mskanji.h"
85
86
87 /* ----------------------------------------------------------------------
88 * private stuffs used by templates
89 */
90
/*
 * Conversion state for one multibyte character in progress.
 *   ch    - bytes collected so far (a character is at most two bytes)
 *   chlen - number of valid bytes currently stored in ch
 */
struct _MSKanjiState {
	char	ch[2];
	int	chlen;
};
typedef struct _MSKanjiState _MSKanjiState;
95
/* Flag bit for _MSKanjiEncodingInfo.mode. */
#define MODE_JIS2004	1

/*
 * Per-encoding options.
 *   mode - bitwise OR of MODE_* flags
 */
typedef struct {
	int	mode;
} _MSKanjiEncodingInfo;
100
101 typedef struct {
102 _MSKanjiEncodingInfo ei;
103 struct {
104 /* for future multi-locale facility */
105 _MSKanjiState s_mblen;
106 _MSKanjiState s_mbrlen;
107 _MSKanjiState s_mbrtowc;
108 _MSKanjiState s_mbtowc;
109 _MSKanjiState s_mbsrtowcs;
110 _MSKanjiState s_mbsnrtowcs;
111 _MSKanjiState s_wcrtomb;
112 _MSKanjiState s_wcsrtombs;
113 _MSKanjiState s_wcsnrtombs;
114 _MSKanjiState s_wctomb;
115 } states;
116 } _MSKanjiCTypeInfo;
117
/* Accessors into a _MSKanjiCTypeInfo: its encoding info and a named state. */
#define _CEI_TO_EI(_cei_)		(&(_cei_)->ei)
#define _CEI_TO_STATE(_cei_, _func_)	((_cei_)->states.s_##_func_)

/* Names and types used by this module's definitions and included templates. */
#define _FUNCNAME(m)			_citrus_MSKanji_##m
#define _ENCODING_INFO			_MSKanjiEncodingInfo
#define _CTYPE_INFO			_MSKanjiCTypeInfo
#define _ENCODING_STATE			_MSKanjiState

/* Encoding properties: at most two bytes per character, no shift states. */
#define _ENCODING_MB_CUR_MAX(_ei_)		2
#define _ENCODING_IS_STATE_DEPENDENT		0
#define _STATE_NEEDS_EXPLICIT_INIT(_ps_)	0
128
129 #ifdef __STDC_ISO_10646__
130 #include "citrus_mskanji_data.h"
131
132 static __inline int
133 /*ARGSUSED*/
134 _FUNCNAME(ucs2kt)(_ENCODING_INFO * __restrict ei,
135 wchar_kuten_t * __restrict ktp, wchar_ucs4_t wc)
136 {
137 struct unicode2kuten_lookup *uk;
138
139 _DIAGASSERT(ktp != NULL);
140
141 /* US-ASCII are not in the list */
142 if (wc < 0x80) {
143 *ktp = wc;
144 return 0;
145 }
146
147 uk = _citrus_uk_bsearch(wc, __shiftjis_mskanji_table__unicode2kuten_lookup, _SHIFTJIS_MSKANJI_TABLE__U2K_LIST_LENGTH);
148
149 if (uk == NULL)
150 *ktp = WEOF;
151 else
152 *ktp = uk->value;
153
154 return 0;
155 }
156
157 static __inline int
158 /*ARGSUSED*/
159 _FUNCNAME(kt2ucs)(_ENCODING_INFO * __restrict ei,
160 wchar_ucs4_t * __restrict up, wchar_kuten_t kt)
161 {
162 _csid_t csid;
163 _index_t idx;
164 struct unicode2kuten_lookup *uk, *table;
165
166 _DIAGASSERT(up != NULL);
167
168 table = NULL;
169
170 /* Special cases */
171 if (kt == 0x5c) { /* backslash -> Yen sign */
172 *up = 0x00A5;
173 return 0;
174 }
175 if (kt == 0x7E) { /* tilde -> overline */
176 *up = 0x203E;
177 return 0;
178 }
179 if (kt < 0x80) {
180 *up = kt;
181 return 0;
182 }
183
184 uk = _citrus_uk_bsearch(kt, __shiftjis_mskanji_table__kuten2unicode_lookup, _SHIFTJIS_MSKANJI_TABLE__K2U_LIST_LENGTH);
185
186 if (uk == NULL)
187 *up = WEOF;
188 else
189 *up = uk->value;
190
191 return 0;
192 }
193 #else
194 #include "citrus_u2k_template.h"
195 #endif
196
/*
 * Return 1 when c is a valid lead byte of a two-byte character
 * (0x81-0x9F or 0xE0-0xFC), otherwise 0.
 */
static int
_mskanji1(int c)
{
	int low_block = (c >= 0x81 && c <= 0x9f);
	int high_block = (c >= 0xe0 && c <= 0xfc);

	return low_block || high_block;
}
206
/*
 * Return 1 when c is a valid trail byte of a two-byte character
 * (0x40-0xFC except 0x7F), otherwise 0.
 */
static int
_mskanji2(int c)
{
	if (c < 0x40 || c > 0xfc)
		return 0;
	return c != 0x7f;
}
216
217 static __inline void
218 /*ARGSUSED*/
219 _citrus_MSKanji_init_state(_MSKanjiEncodingInfo * __restrict ei,
220 _MSKanjiState * __restrict s)
221 {
222 s->chlen = 0;
223 }
224
225 static __inline void
226 /*ARGSUSED*/
227 _citrus_MSKanji_pack_state(_MSKanjiEncodingInfo * __restrict ei,
228 void * __restrict pspriv,
229 const _MSKanjiState * __restrict s)
230 {
231 memcpy(pspriv, (const void *)s, sizeof(*s));
232 }
233
234 static __inline void
235 /*ARGSUSED*/
236 _citrus_MSKanji_unpack_state(_MSKanjiEncodingInfo * __restrict ei,
237 _MSKanjiState * __restrict s,
238 const void * __restrict pspriv)
239 {
240 memcpy((void *)s, pspriv, sizeof(*s));
241 }
242
243 static int
244 /*ARGSUSED*/
245 _citrus_MSKanji_mbrtowc_priv(_MSKanjiEncodingInfo * __restrict ei,
246 wchar_ucs4_t * __restrict pwc,
247 const char ** __restrict s, size_t n,
248 _MSKanjiState * __restrict psenc,
249 size_t * __restrict nresult)
250 {
251 wchar_kuten_t wchar;
252 int len;
253 int chlenbak;
254 const char *s0;
255
256 _DIAGASSERT(nresult != 0);
257 _DIAGASSERT(ei != NULL);
258 _DIAGASSERT(s != NULL);
259 _DIAGASSERT(psenc != NULL);
260
261 s0 = *s;
262
263 if (s0 == NULL) {
264 _citrus_MSKanji_init_state(ei, psenc);
265 *nresult = 0; /* state independent */
266 return (0);
267 }
268
269 chlenbak = psenc->chlen;
270
271 /* make sure we have the first byte in the buffer */
272 switch (psenc->chlen) {
273 case 0:
274 if (n < 1)
275 goto restart;
276 psenc->ch[0] = *s0++;
277 psenc->chlen = 1;
278 n--;
279 break;
280 case 1:
281 break;
282 default:
283 /* illegal state */
284 goto encoding_error;
285 }
286
287 len = _mskanji1(psenc->ch[0] & 0xff) ? 2 : 1;
288 while (psenc->chlen < len) {
289 if (n < 1)
290 goto restart;
291 psenc->ch[psenc->chlen] = *s0++;
292 psenc->chlen++;
293 n--;
294 }
295
296 *s = s0;
297
298 switch (len) {
299 case 1:
300 wchar = psenc->ch[0] & 0xff;
301 break;
302 case 2:
303 if (!_mskanji2(psenc->ch[1] & 0xff))
304 goto encoding_error;
305 wchar = ((psenc->ch[0] & 0xff) << 8) | (psenc->ch[1] & 0xff);
306 break;
307 default:
308 /* illegal state */
309 goto encoding_error;
310 }
311
312 psenc->chlen = 0;
313
314 if (pwc)
315 _citrus_MSKanji_kt2ucs(ei, pwc, wchar);
316
317 if (!wchar)
318 *nresult = 0;
319 else
320 *nresult = len - chlenbak;
321
322 return (0);
323
324 encoding_error:
325 psenc->chlen = 0;
326 *nresult = (size_t)-1;
327 return (EILSEQ);
328
329 restart:
330 *nresult = (size_t)-2;
331 *s = s0;
332 return (0);
333 }
334
335
336 static int
337 _citrus_MSKanji_wcrtomb_priv(_MSKanjiEncodingInfo * __restrict ei,
338 char * __restrict s, size_t n, wchar_ucs4_t wc,
339 _MSKanjiState * __restrict psenc,
340 size_t * __restrict nresult)
341 {
342 int ret;
343
344 _DIAGASSERT(ei != NULL);
345 _DIAGASSERT(psenc != NULL);
346 _DIAGASSERT(s != NULL);
347
348 _citrus_MSKanji_ucs2kt(ei, &wc, wc);
349
350 /* check invalid sequence */
351 if (wc & ~0xffff) {
352 ret = EILSEQ;
353 goto err;
354 }
355
356 if (wc & 0xff00) {
357 if (n < 2) {
358 ret = E2BIG;
359 goto err;
360 }
361
362 s[0] = (wc >> 8) & 0xff;
363 s[1] = wc & 0xff;
364 if (!_mskanji1(s[0] & 0xff) || !_mskanji2(s[1] & 0xff)) {
365 ret = EILSEQ;
366 goto err;
367 }
368
369 *nresult = 2;
370 return 0;
371 } else {
372 if (n < 1) {
373 ret = E2BIG;
374 goto err;
375 }
376
377 s[0] = wc & 0xff;
378 if (_mskanji1(s[0] & 0xff)) {
379 ret = EILSEQ;
380 goto err;
381 }
382
383 *nresult = 1;
384 return 0;
385 }
386
387 err:
388 *nresult = (size_t)-1;
389 return ret;
390 }
391
392
393 static int
394 /*ARGSUSED*/
395 _citrus_MSKanji_stdenc_wctocs(struct _citrus_stdenc *ce,
396 _csid_t * __restrict csid,
397 _index_t * __restrict idx, wchar_kuten_t wc)
398 {
399 _MSKanjiEncodingInfo *ei;
400 _index_t row, col;
401 int offset;
402
403 _DIAGASSERT(ce != NULL && csid != NULL && idx != NULL);
404
405 ei = (_ENCODING_INFO *)(ce->ce_closure);
406 _DIAGASSERT(ei != NULL);
407
408 if ((_wc_t)wc < 0x80) {
409 /* ISO-646 */
410 *csid = 0;
411 *idx = (_index_t)wc;
412 } else if ((_wc_t)wc < 0x100) {
413 /* KANA */
414 *csid = 1;
415 *idx = (_index_t)wc & 0x7F;
416 } else {
417 /* Kanji (containing Gaiji zone) */
418 /*
419 * 94^2 zone (contains a part of Gaiji (0xED40 - 0xEEFC)):
420 * 0x8140 - 0x817E -> 0x2121 - 0x215F
421 * 0x8180 - 0x819E -> 0x2160 - 0x217E
422 * 0x819F - 0x81FC -> 0x2221 - 0x227E
423 *
424 * 0x8240 - 0x827E -> 0x2321 - 0x235F
425 * ...
426 * 0x9F9F - 0x9FFc -> 0x5E21 - 0x5E7E
427 *
428 * 0xE040 - 0xE07E -> 0x5F21 - 0x5F5F
429 * ...
430 * 0xEF9F - 0xEFFC -> 0x7E21 - 0x7E7E
431 *
432 * extended Gaiji zone:
433 * 0xF040 - 0xFCFC
434 *
435 * JIS X0213-plane2:
436 * 0xF040 - 0xF09E -> 0x2121 - 0x217E
437 * 0xF140 - 0xF19E -> 0x2321 - 0x237E
438 * ...
439 * 0xF240 - 0xF29E -> 0x2521 - 0x257E
440 *
441 * 0xF09F - 0xF0FC -> 0x2821 - 0x287E
442 * 0xF29F - 0xF2FC -> 0x2C21 - 0x2C7E
443 * ...
444 * 0xF44F - 0xF49E -> 0x2F21 - 0x2F7E
445 *
446 * 0xF49F - 0xF4FC -> 0x6E21 - 0x6E7E
447 * ...
448 * 0xFC9F - 0xFCFC -> 0x7E21 - 0x7E7E
449 */
450 row = ((_wc_t)wc >> 8) & 0xFF;
451 col = (_wc_t)wc & 0xFF;
452 if (!_mskanji1(row) || !_mskanji2(col))
453 return EILSEQ;
454 if ((ei->mode & MODE_JIS2004) == 0 || row < 0xF0) {
455 *csid = 2;
456 offset = 0x81;
457 } else {
458 *csid = 3;
459 if ((_wc_t)wc <= 0xF49E) {
460 offset = (_wc_t)wc >= 0xF29F ||
461 ((_wc_t)wc >= 0xF09F && (_wc_t)wc <= 0xF0FC)
462 ? 0xED : 0xF0;
463 } else
464 offset = 0xCE;
465 }
466 row -= offset;
467 if (row >= 0x5F)
468 row -= 0x40;
469 row = row * 2 + 0x21;
470 col -= 0x1F;
471 if (col >= 0x61)
472 col -= 1;
473 if (col > 0x7E) {
474 row += 1;
475 col -= 0x5E;
476 }
477 *idx = ((_index_t)row << 8) | col;
478 }
479
480 return 0;
481 }
482
483 static int
484 /*ARGSUSED*/
485 _citrus_MSKanji_stdenc_cstowc(struct _citrus_stdenc *ce,
486 wchar_kuten_t * __restrict wc,
487 _csid_t csid, _index_t idx)
488 {
489 u_int32_t row, col;
490 int offset;
491 _MSKanjiEncodingInfo *ei;
492
493 _DIAGASSERT(wc != NULL && ce != NULL);
494
495 ei = (_ENCODING_INFO *)(ce->ce_closure);
496 _DIAGASSERT(ei != NULL);
497
498 switch (csid) {
499 case 0:
500 /* ISO-646 */
501 if (idx >= 0x80)
502 return EILSEQ;
503 *wc = (wchar_kuten_t)idx;
504 break;
505 case 1:
506 /* kana */
507 if (idx >= 0x80)
508 return EILSEQ;
509 *wc = (wchar_kuten_t)idx + 0x80;
510 break;
511 case 3:
512 if ((ei->mode & MODE_JIS2004) == 0)
513 return EILSEQ;
514 /*FALLTHROUGH*/
515 case 2:
516 /* kanji */
517 row = (idx >> 8);
518 if (row < 0x21)
519 return EILSEQ;
520 if (csid == 3) {
521 if (row <= 0x2F)
522 offset = (row == 0x22 || row >= 0x26)
523 ? 0xED : 0xF0;
524 else if (row >= 0x4D && row <= 0x7E)
525 offset = 0xCE;
526 else
527 return EILSEQ;
528 } else {
529 if (row > 0x97)
530 return EILSEQ;
531 offset = (row < 0x5F) ? 0x81 : 0xC1;
532 }
533 col = idx & 0xFF;
534 if (col < 0x21 || col > 0x7E)
535 return EILSEQ;
536 row -= 0x21; col -= 0x21;
537 if ((row & 1) == 0) {
538 col += 0x40;
539 if (col >= 0x7F)
540 col += 1;
541 } else
542 col += 0x9F;
543 row = row / 2 + offset;
544 *wc = ((wchar_kuten_t)row << 8) | col;
545 break;
546 default:
547 return EILSEQ;
548 }
549
550 return 0;
551 }
552
553 static __inline int
554 /*ARGSUSED*/
555 _citrus_MSKanji_stdenc_get_state_desc_generic(_MSKanjiEncodingInfo * __restrict ei,
556 _MSKanjiState * __restrict psenc,
557 int * __restrict rstate)
558 {
559
560 if (psenc->chlen == 0)
561 *rstate = _STDENC_SDGEN_INITIAL;
562 else
563 *rstate = _STDENC_SDGEN_INCOMPLETE_CHAR;
564
565 return 0;
566 }
567
568 static int
569 /*ARGSUSED*/
570 _citrus_MSKanji_encoding_module_init(_MSKanjiEncodingInfo * __restrict ei,
571 const void * __restrict var,
572 size_t lenvar)
573 {
574 const char *p;
575
576 _DIAGASSERT(ei != NULL);
577
578 p = var;
579 #define MATCH(x, act) \
580 do { \
581 if (lenvar >= (sizeof(#x)-1) && \
582 _bcs_strncasecmp(p, #x, sizeof(#x)-1) == 0) { \
583 act; \
584 lenvar -= sizeof(#x)-1; \
585 p += sizeof(#x)-1; \
586 } \
587 } while (/*CONSTCOND*/0)
588 memset((void *)ei, 0, sizeof(*ei));
589 while (lenvar > 0) {
590 switch (_bcs_toupper(*p)) {
591 case 'J':
592 MATCH(JIS2004, ei->mode |= MODE_JIS2004);
593 break;
594 }
595 ++p;
596 --lenvar;
597 }
598
599 return 0;
600 }
601
602 static void
603 _citrus_MSKanji_encoding_module_uninit(_MSKanjiEncodingInfo *ei)
604 {
605 }
606
607 /* ----------------------------------------------------------------------
608 * public interface for ctype
609 */
610
611 _CITRUS_CTYPE_DECLS(MSKanji); /* presumably declares the MSKanji ctype entry points (macro defined outside this file) -- confirm */
612 _CITRUS_CTYPE_DEF_OPS(MSKanji); /* presumably defines the MSKanji ctype operations table (macro defined outside this file) -- confirm */
613
614 #include "citrus_ctype_template.h" /* NOTE(review): template presumably builds the generic ctype functions from the _FUNCNAME/_ENCODING_* macros and the *_priv helpers above -- confirm */
615
616 /* ----------------------------------------------------------------------
617 * public interface for stdenc
618 */
619
620 _CITRUS_STDENC_DECLS(MSKanji); /* presumably declares the MSKanji stdenc entry points (macro defined outside this file) -- confirm */
621 _CITRUS_STDENC_DEF_OPS(MSKanji); /* presumably defines the MSKanji stdenc operations table (macro defined outside this file) -- confirm */
622
623 #include "citrus_stdenc_template.h" /* NOTE(review): template presumably builds the generic stdenc functions from the _FUNCNAME/_ENCODING_* macros and the stdenc_* helpers above -- confirm */
624