1 1.3 maxv /* $NetBSD: unicode.c,v 1.3 2015/06/21 14:09:47 maxv Exp $ */ 2 1.1 dillo 3 1.1 dillo /*- 4 1.1 dillo * Copyright (c) 2007 The NetBSD Foundation, Inc. 5 1.1 dillo * All rights reserved. 6 1.1 dillo * 7 1.1 dillo * This code is derived from software contributed to The NetBSD Foundation 8 1.1 dillo * by Dieter Baron. 9 1.1 dillo * 10 1.1 dillo * Redistribution and use in source and binary forms, with or without 11 1.1 dillo * modification, are permitted provided that the following conditions 12 1.1 dillo * are met: 13 1.1 dillo * 1. Redistributions of source code must retain the above copyright 14 1.1 dillo * notice, this list of conditions and the following disclaimer. 15 1.1 dillo * 2. Redistributions in binary form must reproduce the above copyright 16 1.1 dillo * notice, this list of conditions and the following disclaimer in the 17 1.1 dillo * documentation and/or other materials provided with the distribution. 18 1.1 dillo * 19 1.1 dillo * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 1.1 dillo * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 1.1 dillo * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 1.1 dillo * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 1.1 dillo * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 1.1 dillo * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 1.1 dillo * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 1.1 dillo * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 1.1 dillo * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 1.1 dillo * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 1.1 dillo * POSSIBILITY OF SUCH DAMAGE. 30 1.1 dillo */ 31 1.1 dillo 32 1.2 lukem #include <sys/cdefs.h> 33 1.3 maxv __KERNEL_RCSID(0, "$NetBSD: unicode.c,v 1.3 2015/06/21 14:09:47 maxv Exp $"); 34 1.2 lukem 35 1.1 dillo #include <sys/null.h> 36 1.1 dillo 37 1.1 dillo #include "unicode.h" 38 1.1 dillo 39 1.1 dillo size_t 40 1.1 dillo utf8_to_utf16(uint16_t *dst, size_t dst_len, 41 1.1 dillo const char *src, size_t src_len, 42 1.1 dillo int flags, int *errp) 43 1.1 dillo { 44 1.3 maxv const unsigned char *s; 45 1.3 maxv size_t spos, dpos; 46 1.3 maxv int error; 47 1.3 maxv uint16_t c; 48 1.1 dillo 49 1.1 dillo #define IS_CONT(c) (((c)&0xc0) == 0x80) 50 1.1 dillo 51 1.3 maxv error = 0; 52 1.3 maxv s = (const unsigned char *)src; 53 1.3 maxv spos = dpos = 0; 54 1.3 maxv while (spos < src_len) { 55 1.3 maxv if (s[spos] < 0x80) { 56 1.3 maxv c = s[spos++]; 57 1.3 maxv } else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK) 58 1.3 maxv && (spos >= src_len || !IS_CONT(s[spos+1])) 59 1.3 maxv && s[spos]>=0xa0) { 60 1.3 maxv /* not valid UTF-8, assume ISO 8859-1 */ 61 1.3 maxv c = s[spos++]; 62 1.3 maxv } else if (s[spos] < 0xc0 || s[spos] >= 0xf5) { 63 1.3 maxv /* continuation byte without lead byte 64 1.3 maxv * or lead byte for codepoint above 0x10ffff */ 65 1.3 maxv error++; 66 1.3 maxv spos++; 67 1.3 maxv continue; 68 1.3 maxv } else if (s[spos] < 0xe0) { 69 1.3 maxv if (spos >= src_len || !IS_CONT(s[spos+1])) { 70 1.3 maxv spos++; 71 1.3 maxv error++; 72 1.3 maxv continue; 73 1.3 maxv } 74 1.3 maxv c = ((s[spos] & 0x3f) << 6) | (s[spos+1] & 0x3f); 75 1.3 maxv spos += 2; 76 1.3 maxv if (c < 0x80) { 77 1.3 maxv /* overlong encoding */ 78 1.3 maxv error++; 79 1.3 maxv continue; 80 1.3 maxv } 81 1.3 maxv } else if (s[spos] < 0xf0) { 82 1.3 maxv if (spos >= src_len-2 || 83 1.3 maxv !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) { 84 1.3 maxv spos++; 85 1.3 maxv error++; 86 1.3 maxv continue; 87 1.3 maxv } 88 1.3 maxv c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6) 89 1.3 maxv | (s[spos+2] & 0x3f); 90 1.3 maxv spos += 3; 91 1.3 maxv if (c < 0x800 || (c & 0xdf00) == 0xd800 ) { 92 1.3 maxv /* overlong encoding or encoded surrogate */ 93 1.3 maxv error++; 94 1.3 maxv continue; 95 1.3 maxv } 96 1.3 maxv } else { 97 1.3 maxv uint32_t cc; 98 1.3 maxv /* UTF-16 surrogate pair */ 99 1.3 maxv 100 1.3 maxv if (spos >= src_len-3 || !IS_CONT(s[spos+1]) 101 1.3 maxv || !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) { 102 1.3 maxv spos++; 103 1.3 maxv error++; 104 1.3 maxv continue; 105 1.3 maxv } 106 1.3 maxv cc = ((s[spos] & 0x03) << 18) | ((s[spos+1] & 0x3f) << 12) 107 1.3 maxv | ((s[spos+2] & 0x3f) << 6) | (s[spos+3] & 0x3f); 108 1.3 maxv spos += 4; 109 1.3 maxv if (cc < 0x10000) { 110 1.3 maxv /* overlong encoding */ 111 1.3 maxv error++; 112 1.3 maxv continue; 113 1.3 maxv } 114 1.3 maxv if (dst && dpos < dst_len) 115 1.3 maxv dst[dpos] = (0xd800 | ((cc-0x10000)>>10)); 116 1.3 maxv dpos++; 117 1.3 maxv c = 0xdc00 | ((cc-0x10000) & 0x3ffff); 118 1.3 maxv } 119 1.3 maxv 120 1.3 maxv if (dst && dpos < dst_len) 121 1.3 maxv dst[dpos] = c; 122 1.3 maxv dpos++; 123 1.3 maxv } 124 1.3 maxv 125 1.3 maxv if (errp) 126 1.3 maxv *errp = error; 127 1.3 maxv return dpos; 128 1.1 dillo #undef IS_CONT 129 1.1 dillo } 130 1.1 dillo 131 1.1 dillo 132 1.1 dillo size_t 133 1.1 dillo utf16_to_utf8(char *dst, size_t dst_len, 134 1.1 dillo const uint16_t *src, size_t src_len, 135 1.1 dillo int flags, int *errp) 136 1.1 dillo { 137 1.3 maxv uint8_t spos, dpos; 138 1.3 maxv int error; 139 1.1 dillo 140 1.1 dillo #define CHECK_LENGTH(l) (dpos > dst_len-(l) ? dst=NULL : NULL) 141 1.1 dillo #define ADD_BYTE(b) (dst ? dst[dpos] = (b) : 0, dpos++) 142 1.1 dillo 143 1.3 maxv error = 0; 144 1.3 maxv dpos = 0; 145 1.3 maxv for (spos = 0; spos < src_len; spos++) { 146 1.3 maxv if (src[spos] < 0x80) { 147 1.3 maxv CHECK_LENGTH(1); 148 1.3 maxv ADD_BYTE(src[spos]); 149 1.3 maxv } else if (src[spos] < 0x800) { 150 1.3 maxv CHECK_LENGTH(2); 151 1.3 maxv ADD_BYTE(0xc0 | (src[spos]>>6)); 152 1.3 maxv ADD_BYTE(0x80 | (src[spos] & 0x3f)); 153 1.3 maxv } else if ((src[spos] & 0xdc00) == 0xd800) { 154 1.3 maxv uint32_t c; 155 1.3 maxv /* first surrogate */ 156 1.3 maxv if (spos == src_len - 1 || (src[spos] & 0xdc00) != 0xdc00) { 157 1.3 maxv /* no second surrogate present */ 158 1.3 maxv error++; 159 1.3 maxv continue; 160 1.3 maxv } 161 1.3 maxv spos++; 162 1.3 maxv CHECK_LENGTH(4); 163 1.3 maxv c = (((src[spos]&0x3ff) << 10) | (src[spos+1]&0x3ff)) + 0x10000; 164 1.3 maxv ADD_BYTE(0xf0 | (c>>18)); 165 1.3 maxv ADD_BYTE(0x80 | ((c>>12) & 0x3f)); 166 1.3 maxv ADD_BYTE(0x80 | ((c>>6) & 0x3f)); 167 1.3 maxv ADD_BYTE(0x80 | (c & 0x3f)); 168 1.3 maxv } else if ((src[spos] & 0xdc00) == 0xdc00) { 169 1.3 maxv /* second surrogate without preceding first surrogate */ 170 1.3 maxv error++; 171 1.3 maxv } else { 172 1.3 maxv CHECK_LENGTH(3); 173 1.3 maxv ADD_BYTE(0xe0 | src[spos]>>12); 174 1.3 maxv ADD_BYTE(0x80 | ((src[spos]>>6) & 0x3f)); 175 1.3 maxv ADD_BYTE(0x80 | (src[spos] & 0x3f)); 176 1.3 maxv } 177 1.3 maxv } 178 1.3 maxv 179 1.3 maxv if (errp) 180 1.3 maxv *errp = error; 181 1.3 maxv return dpos; 182 1.1 dillo #undef ADD_BYTE 183 1.1 dillo #undef CHECK_LENGTH 184 1.1 dillo } 185