citrus_utf1632.c revision 1.1 1 1.1 tshiozak /* $NetBSD $ */
2 1.1 tshiozak
3 1.1 tshiozak /*-
4 1.1 tshiozak * Copyright (c)2003 Citrus Project,
5 1.1 tshiozak * All rights reserved.
6 1.1 tshiozak *
7 1.1 tshiozak * Redistribution and use in source and binary forms, with or without
8 1.1 tshiozak * modification, are permitted provided that the following conditions
9 1.1 tshiozak * are met:
10 1.1 tshiozak * 1. Redistributions of source code must retain the above copyright
11 1.1 tshiozak * notice, this list of conditions and the following disclaimer.
12 1.1 tshiozak * 2. Redistributions in binary form must reproduce the above copyright
13 1.1 tshiozak * notice, this list of conditions and the following disclaimer in the
14 1.1 tshiozak * documentation and/or other materials provided with the distribution.
15 1.1 tshiozak *
16 1.1 tshiozak * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 1.1 tshiozak * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 1.1 tshiozak * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 1.1 tshiozak * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 1.1 tshiozak * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 1.1 tshiozak * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 1.1 tshiozak * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 1.1 tshiozak * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 1.1 tshiozak * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 1.1 tshiozak * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 1.1 tshiozak * SUCH DAMAGE.
27 1.1 tshiozak */
28 1.1 tshiozak
29 1.1 tshiozak #include <sys/cdefs.h>
30 1.1 tshiozak #if defined(LIBC_SCCS) && !defined(lint)
31 1.1 tshiozak __RCSID("$NetBSD: citrus_utf1632.c,v 1.1 2003/06/25 09:51:49 tshiozak Exp $");
32 1.1 tshiozak #endif /* LIBC_SCCS and not lint */
33 1.1 tshiozak
34 1.1 tshiozak #include <assert.h>
35 1.1 tshiozak #include <errno.h>
36 1.1 tshiozak #include <string.h>
37 1.1 tshiozak #include <stdio.h>
38 1.1 tshiozak #include <stdlib.h>
39 1.1 tshiozak #include <stddef.h>
40 1.1 tshiozak #include <locale.h>
41 1.1 tshiozak #include <limits.h>
42 1.1 tshiozak #include <wchar.h>
43 1.1 tshiozak #include <sys/types.h>
44 1.1 tshiozak #include <sys/endian.h>
45 1.1 tshiozak
46 1.1 tshiozak #include "citrus_namespace.h"
47 1.1 tshiozak #include "citrus_types.h"
48 1.1 tshiozak #include "citrus_module.h"
49 1.1 tshiozak #include "citrus_stdenc.h"
50 1.1 tshiozak #include "citrus_bcs.h"
51 1.1 tshiozak
52 1.1 tshiozak #include "citrus_utf1632.h"
53 1.1 tshiozak
54 1.1 tshiozak
55 1.1 tshiozak /* ----------------------------------------------------------------------
56 1.1 tshiozak * private definitions used by the templates
57 1.1 tshiozak */
58 1.1 tshiozak
59 1.1 tshiozak typedef struct { /* decoder state carried from one mbrtowc_priv call to the next */
60 1.1 tshiozak u_int8_t ch[4]; /* raw bytes of the character collected so far */
61 1.1 tshiozak int chlen; /* number of bytes of ch[] already filled by earlier calls */
62 1.1 tshiozak int current_endian; /* _ENDIAN_* taken from a byte-order mark; 0 (_ENDIAN_UNKNOWN) after init_state */
63 1.1 tshiozak } _UTF1632State;
64 1.1 tshiozak
65 1.1 tshiozak typedef struct { /* per-encoding configuration, filled in by _citrus_UTF1632_encoding_module_init() */
66 1.1 tshiozak int preffered_endian; /* (sic) byte order used for output, and for input when no BOM applies; one of _ENDIAN_* */
67 1.1 tshiozak unsigned int cur_max; /* value returned by _ENCODING_MB_CUR_MAX() */
68 1.1 tshiozak #define _ENDIAN_UNKNOWN 0 /* byte order not determined (the zero-initialised value) */
69 1.1 tshiozak #define _ENDIAN_BIG 1 /* most significant byte first */
70 1.1 tshiozak #define _ENDIAN_LITTLE 2 /* least significant byte first */
71 1.1 tshiozak u_int32_t mode; /* bitwise OR of the _MODE_* flags below */
72 1.1 tshiozak #define _MODE_UTF32 0x00000001U /* set: 4-byte UTF-32 units; clear: UTF-16 */
73 1.1 tshiozak #define _MODE_FORCE_ENDIAN 0x00000002U /* always decode with preffered_endian, even after a BOM was seen */
74 1.1 tshiozak } _UTF1632EncodingInfo;
75 1.1 tshiozak
76 1.1 tshiozak #define _FUNCNAME(m) _citrus_UTF1632_##m /* prefixes m with this module's name */
77 1.1 tshiozak #define _ENCODING_INFO _UTF1632EncodingInfo /* configuration type of this module */
78 1.1 tshiozak #define _ENCODING_STATE _UTF1632State /* conversion-state type of this module */
79 1.1 tshiozak #define _ENCODING_MB_CUR_MAX(_ei_) ((_ei_)->cur_max) /* reads the value chosen in encoding_module_init */
80 1.1 tshiozak #define _ENCODING_IS_STATE_DEPENDENT 0 /* NOTE(review): presumably consumed by citrus_stdenc_template.h - confirm */
81 1.1 tshiozak #define _STATE_NEEDS_EXPLICIT_INIT(_ps_) 0 /* always 0; NOTE(review): presumably consumed by the template - confirm */
82 1.1 tshiozak
83 1.1 tshiozak
84 1.1 tshiozak static __inline void
85 1.1 tshiozak /*ARGSUSED*/
86 1.1 tshiozak _citrus_UTF1632_init_state(_UTF1632EncodingInfo *ei, _UTF1632State *s)
87 1.1 tshiozak {
88 1.1 tshiozak memset(s, 0, sizeof(*s));
89 1.1 tshiozak }
90 1.1 tshiozak
91 1.1 tshiozak static int
92 1.1 tshiozak _citrus_UTF1632_mbrtowc_priv(_UTF1632EncodingInfo *ei, wchar_t *pwc,
93 1.1 tshiozak const char **s, size_t n, _UTF1632State *psenc,
94 1.1 tshiozak size_t *nresult)
95 1.1 tshiozak {
96 1.1 tshiozak int chlenbak, endian, needlen;
97 1.1 tshiozak wchar_t wc;
98 1.1 tshiozak size_t result;
99 1.1 tshiozak const char *s0;
100 1.1 tshiozak
101 1.1 tshiozak _DIAGASSERT(nresult != 0);
102 1.1 tshiozak _DIAGASSERT(ei != NULL);
103 1.1 tshiozak _DIAGASSERT(s != NULL);
104 1.1 tshiozak _DIAGASSERT(psenc != NULL);
105 1.1 tshiozak
106 1.1 tshiozak s0 = *s;
107 1.1 tshiozak
108 1.1 tshiozak if (s0 == NULL) {
109 1.1 tshiozak _citrus_UTF1632_init_state(ei, psenc);
110 1.1 tshiozak *nresult = 0; /* state independent */
111 1.1 tshiozak return (0);
112 1.1 tshiozak }
113 1.1 tshiozak
114 1.1 tshiozak result = 0;
115 1.1 tshiozak chlenbak = psenc->chlen;
116 1.1 tshiozak
117 1.1 tshiozak refetch:
118 1.1 tshiozak if ((ei->mode & _MODE_UTF32) != 0 || chlenbak>=2)
119 1.1 tshiozak needlen = 4;
120 1.1 tshiozak else
121 1.1 tshiozak needlen = 2;
122 1.1 tshiozak
123 1.1 tshiozak while (chlenbak < needlen) {
124 1.1 tshiozak if (n==0)
125 1.1 tshiozak goto restart;
126 1.1 tshiozak psenc->ch[chlenbak++] = *s0++;
127 1.1 tshiozak n--;
128 1.1 tshiozak result++;
129 1.1 tshiozak }
130 1.1 tshiozak
131 1.1 tshiozak /* judge endian marker */
132 1.1 tshiozak if ((ei->mode & _MODE_UTF32) == 0) {
133 1.1 tshiozak /* UTF16 */
134 1.1 tshiozak if (psenc->ch[0]==0xFE && psenc->ch[1]==0xFF) {
135 1.1 tshiozak psenc->current_endian = _ENDIAN_BIG;
136 1.1 tshiozak chlenbak = 0;
137 1.1 tshiozak goto refetch;
138 1.1 tshiozak } else if (psenc->ch[0]==0xFF && psenc->ch[1]==0xFE) {
139 1.1 tshiozak psenc->current_endian = _ENDIAN_LITTLE;
140 1.1 tshiozak chlenbak = 0;
141 1.1 tshiozak goto refetch;
142 1.1 tshiozak }
143 1.1 tshiozak } else {
144 1.1 tshiozak /* UTF32 */
145 1.1 tshiozak if (psenc->ch[0]==0x00 && psenc->ch[1]==0x00 &&
146 1.1 tshiozak psenc->ch[2]==0xFE && psenc->ch[3]==0xFF) {
147 1.1 tshiozak psenc->current_endian = _ENDIAN_BIG;
148 1.1 tshiozak chlenbak = 0;
149 1.1 tshiozak goto refetch;
150 1.1 tshiozak } else if (psenc->ch[0]==0xFF && psenc->ch[1]==0xFE &&
151 1.1 tshiozak psenc->ch[2]==0x00 && psenc->ch[3]==0x00) {
152 1.1 tshiozak psenc->current_endian = _ENDIAN_LITTLE;
153 1.1 tshiozak chlenbak = 0;
154 1.1 tshiozak goto refetch;
155 1.1 tshiozak }
156 1.1 tshiozak }
157 1.1 tshiozak if ((ei->mode & _MODE_FORCE_ENDIAN) != 0 ||
158 1.1 tshiozak psenc->current_endian == _ENDIAN_UNKNOWN)
159 1.1 tshiozak endian = ei->preffered_endian;
160 1.1 tshiozak else
161 1.1 tshiozak endian = psenc->current_endian;
162 1.1 tshiozak
163 1.1 tshiozak /* get wc */
164 1.1 tshiozak if ((ei->mode & _MODE_UTF32) == 0) {
165 1.1 tshiozak /* UTF16 */
166 1.1 tshiozak if (needlen==2) {
167 1.1 tshiozak switch (endian) {
168 1.1 tshiozak case _ENDIAN_LITTLE:
169 1.1 tshiozak wc = (psenc->ch[0] |
170 1.1 tshiozak ((wchar_t)psenc->ch[1] << 8));
171 1.1 tshiozak break;
172 1.1 tshiozak case _ENDIAN_BIG:
173 1.1 tshiozak wc = (psenc->ch[1] |
174 1.1 tshiozak ((wchar_t)psenc->ch[0] << 8));
175 1.1 tshiozak break;
176 1.1 tshiozak }
177 1.1 tshiozak if (wc >= 0xD800 && wc <= 0xDBFF) {
178 1.1 tshiozak /* surrogate high */
179 1.1 tshiozak needlen=4;
180 1.1 tshiozak goto refetch;
181 1.1 tshiozak }
182 1.1 tshiozak } else {
183 1.1 tshiozak /* surrogate low */
184 1.1 tshiozak wc -= 0xD800; /* wc : surrogate high (see above) */
185 1.1 tshiozak wc <<= 10;
186 1.1 tshiozak switch (endian) {
187 1.1 tshiozak case _ENDIAN_LITTLE:
188 1.1 tshiozak if (psenc->ch[2]<0xDC || psenc->ch[2]>0xDF)
189 1.1 tshiozak goto ilseq;
190 1.1 tshiozak wc |= psenc->ch[2];
191 1.1 tshiozak wc |= (wchar_t)(psenc->ch[3] & 3) << 8;
192 1.1 tshiozak break;
193 1.1 tshiozak case _ENDIAN_BIG:
194 1.1 tshiozak if (psenc->ch[3]<0xDC || psenc->ch[3]>0xDF)
195 1.1 tshiozak goto ilseq;
196 1.1 tshiozak wc |= psenc->ch[3];
197 1.1 tshiozak wc |= (wchar_t)(psenc->ch[2] & 3) << 8;
198 1.1 tshiozak break;
199 1.1 tshiozak }
200 1.1 tshiozak wc += 0x10000;
201 1.1 tshiozak }
202 1.1 tshiozak } else {
203 1.1 tshiozak /* UTF32 */
204 1.1 tshiozak switch (endian) {
205 1.1 tshiozak case _ENDIAN_LITTLE:
206 1.1 tshiozak wc = (psenc->ch[0] |
207 1.1 tshiozak ((wchar_t)psenc->ch[1] << 8) |
208 1.1 tshiozak ((wchar_t)psenc->ch[2] << 16) |
209 1.1 tshiozak ((wchar_t)psenc->ch[3] << 24));
210 1.1 tshiozak break;
211 1.1 tshiozak case _ENDIAN_BIG:
212 1.1 tshiozak wc = (psenc->ch[3] |
213 1.1 tshiozak ((wchar_t)psenc->ch[2] << 8) |
214 1.1 tshiozak ((wchar_t)psenc->ch[1] << 16) |
215 1.1 tshiozak ((wchar_t)psenc->ch[0] << 24));
216 1.1 tshiozak break;
217 1.1 tshiozak }
218 1.1 tshiozak }
219 1.1 tshiozak
220 1.1 tshiozak
221 1.1 tshiozak *pwc = wc;
222 1.1 tshiozak psenc->chlen = 0;
223 1.1 tshiozak *nresult = result;
224 1.1 tshiozak *s = s0;
225 1.1 tshiozak
226 1.1 tshiozak return (0);
227 1.1 tshiozak
228 1.1 tshiozak ilseq:
229 1.1 tshiozak *nresult = (size_t)-1;
230 1.1 tshiozak psenc->chlen = 0;
231 1.1 tshiozak return (EILSEQ);
232 1.1 tshiozak
233 1.1 tshiozak restart:
234 1.1 tshiozak *nresult = (size_t)-2;
235 1.1 tshiozak psenc->chlen = chlenbak;
236 1.1 tshiozak *s = s0;
237 1.1 tshiozak return (0);
238 1.1 tshiozak }
239 1.1 tshiozak
240 1.1 tshiozak static int
241 1.1 tshiozak _citrus_UTF1632_wcrtomb_priv(_UTF1632EncodingInfo *ei, char *s, size_t n,
242 1.1 tshiozak wchar_t wc, _UTF1632State *psenc,
243 1.1 tshiozak size_t *nresult)
244 1.1 tshiozak {
245 1.1 tshiozak int ret;
246 1.1 tshiozak wchar_t wc2;
247 1.1 tshiozak
248 1.1 tshiozak _DIAGASSERT(ei != NULL);
249 1.1 tshiozak _DIAGASSERT(nresult != 0);
250 1.1 tshiozak _DIAGASSERT(s != NULL);
251 1.1 tshiozak
252 1.1 tshiozak /* reset state */
253 1.1 tshiozak if (wc == 0) {
254 1.1 tshiozak *nresult = 0; /* stateless */
255 1.1 tshiozak return 0;
256 1.1 tshiozak }
257 1.1 tshiozak
258 1.1 tshiozak wc2 = 0;
259 1.1 tshiozak if ((ei->mode & _MODE_UTF32)==0) {
260 1.1 tshiozak /* UTF16 */
261 1.1 tshiozak if (wc>0xFFFF) {
262 1.1 tshiozak /* surrogate */
263 1.1 tshiozak if (wc>0x10FFFF) {
264 1.1 tshiozak ret = EILSEQ;
265 1.1 tshiozak goto err;
266 1.1 tshiozak }
267 1.1 tshiozak if (n < 4) {
268 1.1 tshiozak ret = E2BIG;
269 1.1 tshiozak goto err;
270 1.1 tshiozak }
271 1.1 tshiozak wc -= 0x10000;
272 1.1 tshiozak wc2 = (wc & 0x3FF) | 0xDC00;
273 1.1 tshiozak wc = (wc>>10) | 0xD800;
274 1.1 tshiozak *nresult = (size_t)4;
275 1.1 tshiozak } else {
276 1.1 tshiozak if (n < 2) {
277 1.1 tshiozak ret = E2BIG;
278 1.1 tshiozak goto err;
279 1.1 tshiozak }
280 1.1 tshiozak *nresult = (size_t)2;
281 1.1 tshiozak }
282 1.1 tshiozak
283 1.1 tshiozak surrogate:
284 1.1 tshiozak switch (ei->preffered_endian) {
285 1.1 tshiozak case _ENDIAN_BIG:
286 1.1 tshiozak s[1] = wc;
287 1.1 tshiozak s[0] = (wc >>= 8);
288 1.1 tshiozak break;
289 1.1 tshiozak case _ENDIAN_LITTLE:
290 1.1 tshiozak s[0] = wc;
291 1.1 tshiozak s[1] = (wc >>= 8);
292 1.1 tshiozak break;
293 1.1 tshiozak }
294 1.1 tshiozak if (wc2!=0) {
295 1.1 tshiozak wc = wc2;
296 1.1 tshiozak wc2 = 0;
297 1.1 tshiozak s += 2;
298 1.1 tshiozak goto surrogate;
299 1.1 tshiozak }
300 1.1 tshiozak } else {
301 1.1 tshiozak /* UTF32 */
302 1.1 tshiozak if (n < 4) {
303 1.1 tshiozak ret = E2BIG;
304 1.1 tshiozak goto err;
305 1.1 tshiozak }
306 1.1 tshiozak switch (ei->preffered_endian) {
307 1.1 tshiozak case _ENDIAN_BIG:
308 1.1 tshiozak s[3] = wc;
309 1.1 tshiozak s[2] = (wc >>= 8);
310 1.1 tshiozak s[1] = (wc >>= 8);
311 1.1 tshiozak s[0] = (wc >>= 8);
312 1.1 tshiozak break;
313 1.1 tshiozak case _ENDIAN_LITTLE:
314 1.1 tshiozak s[0] = wc;
315 1.1 tshiozak s[1] = (wc >>= 8);
316 1.1 tshiozak s[2] = (wc >>= 8);
317 1.1 tshiozak s[3] = (wc >>= 8);
318 1.1 tshiozak break;
319 1.1 tshiozak }
320 1.1 tshiozak *nresult = (size_t)4;
321 1.1 tshiozak }
322 1.1 tshiozak
323 1.1 tshiozak return 0;
324 1.1 tshiozak
325 1.1 tshiozak err:
326 1.1 tshiozak *nresult = (size_t)-1;
327 1.1 tshiozak return ret;
328 1.1 tshiozak }
329 1.1 tshiozak
330 1.1 tshiozak static void
331 1.1 tshiozak parse_variable(_UTF1632EncodingInfo * __restrict ei,
332 1.1 tshiozak const void * __restrict var, size_t lenvar)
333 1.1 tshiozak {
334 1.1 tshiozak #define MATCH(x, act) \
335 1.1 tshiozak do { \
336 1.1 tshiozak if (lenvar >= (sizeof(#x)-1) && \
337 1.1 tshiozak _bcs_strncasecmp(p, #x, sizeof(#x)-1) == 0) { \
338 1.1 tshiozak act; \
339 1.1 tshiozak lenvar -= sizeof(#x)-1; \
340 1.1 tshiozak p += sizeof(#x)-1; \
341 1.1 tshiozak } \
342 1.1 tshiozak } while (/*CONSTCOND*/0)
343 1.1 tshiozak const char *p;
344 1.1 tshiozak p = var;
345 1.1 tshiozak while (lenvar>0) {
346 1.1 tshiozak switch (*p) {
347 1.1 tshiozak case 'B':
348 1.1 tshiozak case 'b':
349 1.1 tshiozak MATCH(big, ei->preffered_endian = _ENDIAN_BIG);
350 1.1 tshiozak break;
351 1.1 tshiozak case 'L':
352 1.1 tshiozak case 'l':
353 1.1 tshiozak MATCH(little, ei->preffered_endian = _ENDIAN_LITTLE);
354 1.1 tshiozak break;
355 1.1 tshiozak case 'F':
356 1.1 tshiozak case 'f':
357 1.1 tshiozak MATCH(force, ei->mode |= _MODE_FORCE_ENDIAN);
358 1.1 tshiozak break;
359 1.1 tshiozak case 'U':
360 1.1 tshiozak case 'u':
361 1.1 tshiozak MATCH(utf32, ei->mode |= _MODE_UTF32);
362 1.1 tshiozak break;
363 1.1 tshiozak }
364 1.1 tshiozak p++;
365 1.1 tshiozak lenvar--;
366 1.1 tshiozak }
367 1.1 tshiozak }
368 1.1 tshiozak
369 1.1 tshiozak static int
370 1.1 tshiozak /*ARGSUSED*/
371 1.1 tshiozak _citrus_UTF1632_encoding_module_init(_UTF1632EncodingInfo * __restrict ei,
372 1.1 tshiozak const void * __restrict var,
373 1.1 tshiozak size_t lenvar)
374 1.1 tshiozak {
375 1.1 tshiozak _DIAGASSERT(ei != NULL);
376 1.1 tshiozak
377 1.1 tshiozak memset((void *)ei, 0, sizeof(*ei));
378 1.1 tshiozak
379 1.1 tshiozak parse_variable(ei, var, lenvar);
380 1.1 tshiozak
381 1.1 tshiozak if ((ei->mode&_MODE_UTF32)==0)
382 1.1 tshiozak ei->cur_max = 6; /* endian + surrogate */
383 1.1 tshiozak else
384 1.1 tshiozak ei->cur_max = 8; /* endian + normal */
385 1.1 tshiozak
386 1.1 tshiozak if (ei->preffered_endian == _ENDIAN_UNKNOWN) {
387 1.1 tshiozak #if BYTE_ORDER == BIG_ENDIAN
388 1.1 tshiozak ei->preffered_endian = _ENDIAN_BIG;
389 1.1 tshiozak #else
390 1.1 tshiozak ei->preffered_endian = _ENDIAN_LITTLE;
391 1.1 tshiozak #endif
392 1.1 tshiozak }
393 1.1 tshiozak
394 1.1 tshiozak return (0);
395 1.1 tshiozak }
396 1.1 tshiozak
397 1.1 tshiozak static void
398 1.1 tshiozak /*ARGSUSED*/
399 1.1 tshiozak _citrus_UTF1632_encoding_module_uninit(_UTF1632EncodingInfo *ei)
400 1.1 tshiozak {
401 1.1 tshiozak }
402 1.1 tshiozak
403 1.1 tshiozak static __inline int
404 1.1 tshiozak /*ARGSUSED*/
405 1.1 tshiozak _citrus_UTF1632_stdenc_wctocs(_UTF1632EncodingInfo * __restrict ei,
406 1.1 tshiozak _csid_t * __restrict csid,
407 1.1 tshiozak _index_t * __restrict idx,
408 1.1 tshiozak _wc_t wc)
409 1.1 tshiozak {
410 1.1 tshiozak
411 1.1 tshiozak _DIAGASSERT(csid != NULL && idx != NULL);
412 1.1 tshiozak
413 1.1 tshiozak *csid = 0;
414 1.1 tshiozak *idx = (_index_t)wc;
415 1.1 tshiozak
416 1.1 tshiozak return (0);
417 1.1 tshiozak }
418 1.1 tshiozak
419 1.1 tshiozak static __inline int
420 1.1 tshiozak /*ARGSUSED*/
421 1.1 tshiozak _citrus_UTF1632_stdenc_cstowc(_UTF1632EncodingInfo * __restrict ei,
422 1.1 tshiozak _wc_t * __restrict wc,
423 1.1 tshiozak _csid_t csid, _index_t idx)
424 1.1 tshiozak {
425 1.1 tshiozak
426 1.1 tshiozak _DIAGASSERT(wc != NULL);
427 1.1 tshiozak
428 1.1 tshiozak if (csid != 0)
429 1.1 tshiozak return (EILSEQ);
430 1.1 tshiozak
431 1.1 tshiozak *wc = (_wc_t)idx;
432 1.1 tshiozak
433 1.1 tshiozak return (0);
434 1.1 tshiozak }
435 1.1 tshiozak
436 1.1 tshiozak
437 1.1 tshiozak /* ----------------------------------------------------------------------
438 1.1 tshiozak * public interface for stdenc
439 1.1 tshiozak */
440 1.1 tshiozak
441 1.1 tshiozak _CITRUS_STDENC_DECLS(UTF1632); /* NOTE(review): presumably declares this module's stdenc entry points - confirm in citrus_stdenc.h */
442 1.1 tshiozak _CITRUS_STDENC_DEF_OPS(UTF1632); /* NOTE(review): presumably defines the operations table for the module - confirm */
443 1.1 tshiozak
444 1.1 tshiozak #include "citrus_stdenc_template.h" /* included last, after the _FUNCNAME/_ENCODING_* macros and private functions above; presumably expands into code that uses them */
445