unicode.c revision 1.1 1 1.1 dillo /* $NetBSD: unicode.c,v 1.1 2007/03/06 00:10:39 dillo Exp $ */
2 1.1 dillo
3 1.1 dillo /*-
4 1.1 dillo * Copyright (c) 2007 The NetBSD Foundation, Inc.
5 1.1 dillo * All rights reserved.
6 1.1 dillo *
7 1.1 dillo * This code is derived from software contributed to The NetBSD Foundation
8 1.1 dillo * by Dieter Baron.
9 1.1 dillo *
10 1.1 dillo * Redistribution and use in source and binary forms, with or without
11 1.1 dillo * modification, are permitted provided that the following conditions
12 1.1 dillo * are met:
13 1.1 dillo * 1. Redistributions of source code must retain the above copyright
14 1.1 dillo * notice, this list of conditions and the following disclaimer.
15 1.1 dillo * 2. Redistributions in binary form must reproduce the above copyright
16 1.1 dillo * notice, this list of conditions and the following disclaimer in the
17 1.1 dillo * documentation and/or other materials provided with the distribution.
18 1.1 dillo *
19 1.1 dillo * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 1.1 dillo * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 1.1 dillo * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 1.1 dillo * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 1.1 dillo * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 1.1 dillo * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 1.1 dillo * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 1.1 dillo * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 1.1 dillo * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 1.1 dillo * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 1.1 dillo * POSSIBILITY OF SUCH DAMAGE.
30 1.1 dillo */
31 1.1 dillo
32 1.1 dillo #include <sys/null.h>
33 1.1 dillo
34 1.1 dillo #include "unicode.h"
35 1.1 dillo
36 1.1 dillo size_t
37 1.1 dillo utf8_to_utf16(uint16_t *dst, size_t dst_len,
38 1.1 dillo const char *src, size_t src_len,
39 1.1 dillo int flags, int *errp)
40 1.1 dillo {
41 1.1 dillo const unsigned char *s;
42 1.1 dillo size_t spos, dpos;
43 1.1 dillo int error;
44 1.1 dillo uint16_t c;
45 1.1 dillo
46 1.1 dillo #define IS_CONT(c) (((c)&0xc0) == 0x80)
47 1.1 dillo
48 1.1 dillo error = 0;
49 1.1 dillo s = (const unsigned char *)src;
50 1.1 dillo spos = dpos = 0;
51 1.1 dillo while (spos<src_len) {
52 1.1 dillo if (s[spos] < 0x80)
53 1.1 dillo c = s[spos++];
54 1.1 dillo else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
55 1.1 dillo && (spos >= src_len || !IS_CONT(s[spos+1]))
56 1.1 dillo && s[spos]>=0xa0) {
57 1.1 dillo /* not valid UTF-8, assume ISO 8859-1 */
58 1.1 dillo c = s[spos++];
59 1.1 dillo }
60 1.1 dillo else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
61 1.1 dillo /* continuation byte without lead byte
62 1.1 dillo or lead byte for codepoint above 0x10ffff */
63 1.1 dillo error++;
64 1.1 dillo spos++;
65 1.1 dillo continue;
66 1.1 dillo }
67 1.1 dillo else if (s[spos] < 0xe0) {
68 1.1 dillo if (spos >= src_len || !IS_CONT(s[spos+1])) {
69 1.1 dillo spos++;
70 1.1 dillo error++;
71 1.1 dillo continue;
72 1.1 dillo }
73 1.1 dillo c = ((s[spos] & 0x3f) << 6) | (s[spos+1] & 0x3f);
74 1.1 dillo spos += 2;
75 1.1 dillo if (c < 0x80) {
76 1.1 dillo /* overlong encoding */
77 1.1 dillo error++;
78 1.1 dillo continue;
79 1.1 dillo }
80 1.1 dillo }
81 1.1 dillo else if (s[spos] < 0xf0) {
82 1.1 dillo if (spos >= src_len-2
83 1.1 dillo || !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
84 1.1 dillo spos++;
85 1.1 dillo error++;
86 1.1 dillo continue;
87 1.1 dillo }
88 1.1 dillo c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6)
89 1.1 dillo | (s[spos+2] & 0x3f);
90 1.1 dillo spos += 3;
91 1.1 dillo if (c < 0x800 || (c & 0xdf00) == 0xd800 ) {
92 1.1 dillo /* overlong encoding or encoded surrogate */
93 1.1 dillo error++;
94 1.1 dillo continue;
95 1.1 dillo }
96 1.1 dillo }
97 1.1 dillo else {
98 1.1 dillo uint32_t cc;
99 1.1 dillo /* UTF-16 surrogate pair */
100 1.1 dillo
101 1.1 dillo if (spos >= src_len-3 || !IS_CONT(s[spos+1])
102 1.1 dillo || !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
103 1.1 dillo spos++;
104 1.1 dillo error++;
105 1.1 dillo
106 1.1 dillo continue;
107 1.1 dillo }
108 1.1 dillo cc = ((s[spos] & 0x03) << 18) | ((s[spos+1] & 0x3f) << 12)
109 1.1 dillo | ((s[spos+2] & 0x3f) << 6) | (s[spos+3] & 0x3f);
110 1.1 dillo spos += 4;
111 1.1 dillo if (cc < 0x10000) {
112 1.1 dillo /* overlong encoding */
113 1.1 dillo error++;
114 1.1 dillo continue;
115 1.1 dillo }
116 1.1 dillo if (dst && dpos < dst_len)
117 1.1 dillo dst[dpos] = (0xd800 | ((cc-0x10000)>>10));
118 1.1 dillo dpos++;
119 1.1 dillo c = 0xdc00 | ((cc-0x10000) & 0x3ffff);
120 1.1 dillo }
121 1.1 dillo
122 1.1 dillo if (dst && dpos < dst_len)
123 1.1 dillo dst[dpos] = c;
124 1.1 dillo dpos++;
125 1.1 dillo }
126 1.1 dillo
127 1.1 dillo if (errp)
128 1.1 dillo *errp = error;
129 1.1 dillo
130 1.1 dillo return dpos;
131 1.1 dillo
132 1.1 dillo #undef IS_CONT
133 1.1 dillo }
134 1.1 dillo
135 1.1 dillo
/*
 * Convert the src_len UTF-16 code units at src to UTF-8.
 *
 * dst/dst_len: output buffer and its capacity in bytes.  As soon as one
 *	encoded character does not fit completely, nothing further is
 *	stored, but the bytes are still counted.  dst may be NULL to only
 *	compute the required length.
 * flags: currently unused.
 * errp: if not NULL, receives the number of unpaired surrogates that were
 *	skipped.
 *
 * Returns the number of bytes the complete conversion produces, which may
 * exceed dst_len.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
	      const uint16_t *src, size_t src_len,
	      int flags, int *errp)
{
	size_t spos, dpos;
	int error;

	/*
	 * Stop storing once fewer than l bytes remain.  The dst_len < l test
	 * comes first so that dst_len - l cannot wrap around.
	 */
#define CHECK_LENGTH(l)	do {						\
	if (dst != NULL && (dst_len < (l) || dpos > dst_len - (l)))	\
		dst = NULL;						\
} while (/*CONSTCOND*/0)
	/* Store one byte if still writing; always count it. */
#define ADD_BYTE(b)	do {						\
	if (dst != NULL)						\
		dst[dpos] = (char)(b);					\
	dpos++;								\
} while (/*CONSTCOND*/0)

	(void)flags;

	error = 0;
	dpos = 0;
	for (spos = 0; spos < src_len; spos++) {
		if (src[spos] < 0x80) {
			CHECK_LENGTH(1);
			ADD_BYTE(src[spos]);
		}
		else if (src[spos] < 0x800) {
			CHECK_LENGTH(2);
			ADD_BYTE(0xc0 | (src[spos] >> 6));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		}
		else if ((src[spos] & 0xfc00) == 0xd800) {
			uint32_t c;
			/* first (high) surrogate, U+D800..U+DBFF */
			if (spos == src_len - 1
			    || (src[spos + 1] & 0xfc00) != 0xdc00) {
				/* no second surrogate present */
				error++;
				continue;
			}
			/* consume the low surrogate as well */
			spos++;
			CHECK_LENGTH(4);
			c = ((((uint32_t)src[spos - 1] & 0x3ff) << 10)
			    | ((uint32_t)src[spos] & 0x3ff)) + 0x10000;
			ADD_BYTE(0xf0 | (c >> 18));
			ADD_BYTE(0x80 | ((c >> 12) & 0x3f));
			ADD_BYTE(0x80 | ((c >> 6) & 0x3f));
			ADD_BYTE(0x80 | (c & 0x3f));
		}
		else if ((src[spos] & 0xfc00) == 0xdc00) {
			/* second surrogate without preceding first surrogate */
			error++;
		}
		else {
			/* remaining BMP characters, U+0800..U+FFFF */
			CHECK_LENGTH(3);
			ADD_BYTE(0xe0 | (src[spos] >> 12));
			ADD_BYTE(0x80 | ((src[spos] >> 6) & 0x3f));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		}
	}

	if (errp)
		*errp = error;

	return dpos;

#undef ADD_BYTE
#undef CHECK_LENGTH
}
195