unicode.c revision 1.2.56.1 1 1.2.56.1 jdolecek /* $NetBSD: unicode.c,v 1.2.56.1 2017/12/03 11:38:41 jdolecek Exp $ */
2 1.1 dillo
3 1.1 dillo /*-
4 1.1 dillo * Copyright (c) 2007 The NetBSD Foundation, Inc.
5 1.1 dillo * All rights reserved.
6 1.1 dillo *
7 1.1 dillo * This code is derived from software contributed to The NetBSD Foundation
8 1.1 dillo * by Dieter Baron.
9 1.1 dillo *
10 1.1 dillo * Redistribution and use in source and binary forms, with or without
11 1.1 dillo * modification, are permitted provided that the following conditions
12 1.1 dillo * are met:
13 1.1 dillo * 1. Redistributions of source code must retain the above copyright
14 1.1 dillo * notice, this list of conditions and the following disclaimer.
15 1.1 dillo * 2. Redistributions in binary form must reproduce the above copyright
16 1.1 dillo * notice, this list of conditions and the following disclaimer in the
17 1.1 dillo * documentation and/or other materials provided with the distribution.
18 1.1 dillo *
19 1.1 dillo * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 1.1 dillo * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 1.1 dillo * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 1.1 dillo * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 1.1 dillo * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 1.1 dillo * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 1.1 dillo * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 1.1 dillo * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 1.1 dillo * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 1.1 dillo * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 1.1 dillo * POSSIBILITY OF SUCH DAMAGE.
30 1.1 dillo */
31 1.1 dillo
32 1.2 lukem #include <sys/cdefs.h>
33 1.2 lukem __KERNEL_RCSID(0, "$NetBSD: unicode.c,v 1.2.56.1 2017/12/03 11:38:41 jdolecek Exp $");
34 1.2 lukem
35 1.1 dillo #include <sys/null.h>
36 1.1 dillo
37 1.1 dillo #include "unicode.h"
38 1.1 dillo
39 1.1 dillo size_t
40 1.1 dillo utf8_to_utf16(uint16_t *dst, size_t dst_len,
41 1.1 dillo const char *src, size_t src_len,
42 1.1 dillo int flags, int *errp)
43 1.1 dillo {
44 1.2.56.1 jdolecek const unsigned char *s;
45 1.2.56.1 jdolecek size_t spos, dpos;
46 1.2.56.1 jdolecek int error;
47 1.2.56.1 jdolecek uint16_t c;
48 1.1 dillo
49 1.1 dillo #define IS_CONT(c) (((c)&0xc0) == 0x80)
50 1.1 dillo
51 1.2.56.1 jdolecek error = 0;
52 1.2.56.1 jdolecek s = (const unsigned char *)src;
53 1.2.56.1 jdolecek spos = dpos = 0;
54 1.2.56.1 jdolecek while (spos < src_len) {
55 1.2.56.1 jdolecek if (s[spos] < 0x80) {
56 1.2.56.1 jdolecek c = s[spos++];
57 1.2.56.1 jdolecek } else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
58 1.2.56.1 jdolecek && (spos >= src_len || !IS_CONT(s[spos+1]))
59 1.2.56.1 jdolecek && s[spos]>=0xa0) {
60 1.2.56.1 jdolecek /* not valid UTF-8, assume ISO 8859-1 */
61 1.2.56.1 jdolecek c = s[spos++];
62 1.2.56.1 jdolecek } else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
63 1.2.56.1 jdolecek /* continuation byte without lead byte
64 1.2.56.1 jdolecek * or lead byte for codepoint above 0x10ffff */
65 1.2.56.1 jdolecek error++;
66 1.2.56.1 jdolecek spos++;
67 1.2.56.1 jdolecek continue;
68 1.2.56.1 jdolecek } else if (s[spos] < 0xe0) {
69 1.2.56.1 jdolecek if (spos >= src_len || !IS_CONT(s[spos+1])) {
70 1.2.56.1 jdolecek spos++;
71 1.2.56.1 jdolecek error++;
72 1.2.56.1 jdolecek continue;
73 1.2.56.1 jdolecek }
74 1.2.56.1 jdolecek c = ((s[spos] & 0x3f) << 6) | (s[spos+1] & 0x3f);
75 1.2.56.1 jdolecek spos += 2;
76 1.2.56.1 jdolecek if (c < 0x80) {
77 1.2.56.1 jdolecek /* overlong encoding */
78 1.2.56.1 jdolecek error++;
79 1.2.56.1 jdolecek continue;
80 1.2.56.1 jdolecek }
81 1.2.56.1 jdolecek } else if (s[spos] < 0xf0) {
82 1.2.56.1 jdolecek if (spos >= src_len-2 ||
83 1.2.56.1 jdolecek !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
84 1.2.56.1 jdolecek spos++;
85 1.2.56.1 jdolecek error++;
86 1.2.56.1 jdolecek continue;
87 1.2.56.1 jdolecek }
88 1.2.56.1 jdolecek c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6)
89 1.2.56.1 jdolecek | (s[spos+2] & 0x3f);
90 1.2.56.1 jdolecek spos += 3;
91 1.2.56.1 jdolecek if (c < 0x800 || (c & 0xdf00) == 0xd800 ) {
92 1.2.56.1 jdolecek /* overlong encoding or encoded surrogate */
93 1.2.56.1 jdolecek error++;
94 1.2.56.1 jdolecek continue;
95 1.2.56.1 jdolecek }
96 1.2.56.1 jdolecek } else {
97 1.2.56.1 jdolecek uint32_t cc;
98 1.2.56.1 jdolecek /* UTF-16 surrogate pair */
99 1.2.56.1 jdolecek
100 1.2.56.1 jdolecek if (spos >= src_len-3 || !IS_CONT(s[spos+1])
101 1.2.56.1 jdolecek || !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
102 1.2.56.1 jdolecek spos++;
103 1.2.56.1 jdolecek error++;
104 1.2.56.1 jdolecek continue;
105 1.2.56.1 jdolecek }
106 1.2.56.1 jdolecek cc = ((s[spos] & 0x03) << 18) | ((s[spos+1] & 0x3f) << 12)
107 1.2.56.1 jdolecek | ((s[spos+2] & 0x3f) << 6) | (s[spos+3] & 0x3f);
108 1.2.56.1 jdolecek spos += 4;
109 1.2.56.1 jdolecek if (cc < 0x10000) {
110 1.2.56.1 jdolecek /* overlong encoding */
111 1.2.56.1 jdolecek error++;
112 1.2.56.1 jdolecek continue;
113 1.2.56.1 jdolecek }
114 1.2.56.1 jdolecek if (dst && dpos < dst_len)
115 1.2.56.1 jdolecek dst[dpos] = (0xd800 | ((cc-0x10000)>>10));
116 1.2.56.1 jdolecek dpos++;
117 1.2.56.1 jdolecek c = 0xdc00 | ((cc-0x10000) & 0x3ffff);
118 1.2.56.1 jdolecek }
119 1.2.56.1 jdolecek
120 1.2.56.1 jdolecek if (dst && dpos < dst_len)
121 1.2.56.1 jdolecek dst[dpos] = c;
122 1.2.56.1 jdolecek dpos++;
123 1.2.56.1 jdolecek }
124 1.2.56.1 jdolecek
125 1.2.56.1 jdolecek if (errp)
126 1.2.56.1 jdolecek *errp = error;
127 1.2.56.1 jdolecek return dpos;
128 1.1 dillo #undef IS_CONT
129 1.1 dillo }
130 1.1 dillo
131 1.1 dillo
/*
 * Convert UTF-16 code units to a UTF-8 byte string.
 *
 *	dst	output buffer, or NULL to only count the required bytes
 *	dst_len	capacity of dst, in bytes
 *	src	input code units
 *	src_len	number of input code units
 *	flags	currently not examined by this function
 *	errp	if not NULL, receives the number of unpaired surrogates skipped
 *
 * Returns the number of bytes the complete conversion produces.  This may
 * exceed dst_len.  A character is only stored if all of its bytes fit; once
 * one character does not fit, nothing further is stored and the remaining
 * output is only counted.  No terminating NUL is written.
 *
 * A high surrogate that is not followed by a low surrogate, and a low
 * surrogate that is not preceded by a high one, are skipped and counted
 * in *errp.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
	      const uint16_t *src, size_t src_len,
	      int flags, int *errp)
{
	size_t spos, dpos;
	int error;

#define IS_HIGH_SURROGATE(u)	(((u) & 0xfc00) == 0xd800)
#define IS_LOW_SURROGATE(u)	(((u) & 0xfc00) == 0xdc00)

/*
 * Stop storing output as soon as the next character (l bytes) does not
 * fit.  While dst is non-NULL, dpos <= dst_len holds, so the subtraction
 * cannot wrap around.
 */
#define CHECK_LENGTH(l)	do {						\
	if (dst != NULL && dst_len - dpos < (size_t)(l))		\
		dst = NULL;						\
} while (0)

/* Store one byte if output is still enabled; always count it. */
#define ADD_BYTE(b)	do {						\
	if (dst != NULL)						\
		dst[dpos] = (char)(b);					\
	dpos++;								\
} while (0)

	error = 0;
	dpos = 0;
	for (spos = 0; spos < src_len; spos++) {
		if (src[spos] < 0x80) {
			/* U+0000..U+007F: one byte */
			CHECK_LENGTH(1);
			ADD_BYTE(src[spos]);
		} else if (src[spos] < 0x800) {
			/* U+0080..U+07FF: two bytes */
			CHECK_LENGTH(2);
			ADD_BYTE(0xc0 | (src[spos] >> 6));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		} else if (IS_HIGH_SURROGATE(src[spos])) {
			uint32_t c;

			/* first surrogate: must be followed by a second one */
			if (spos + 1 >= src_len ||
			    !IS_LOW_SURROGATE(src[spos + 1])) {
				/* no second surrogate present */
				error++;
				continue;
			}
			/* combine the pair before stepping over its second half */
			c = ((((uint32_t)src[spos] & 0x3ff) << 10)
			    | ((uint32_t)src[spos + 1] & 0x3ff)) + 0x10000;
			spos++;
			/* U+10000..U+10FFFF: four bytes */
			CHECK_LENGTH(4);
			ADD_BYTE(0xf0 | (c >> 18));
			ADD_BYTE(0x80 | ((c >> 12) & 0x3f));
			ADD_BYTE(0x80 | ((c >> 6) & 0x3f));
			ADD_BYTE(0x80 | (c & 0x3f));
		} else if (IS_LOW_SURROGATE(src[spos])) {
			/* second surrogate without preceding first surrogate */
			error++;
		} else {
			/* remaining U+0800..U+FFFF: three bytes */
			CHECK_LENGTH(3);
			ADD_BYTE(0xe0 | (src[spos] >> 12));
			ADD_BYTE(0x80 | ((src[spos] >> 6) & 0x3f));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		}
	}

	if (errp)
		*errp = error;
	return dpos;
#undef ADD_BYTE
#undef CHECK_LENGTH
#undef IS_LOW_SURROGATE
#undef IS_HIGH_SURROGATE
}
185