Home | History | Annotate | Line # | Download | only in hfs
      1 /* $NetBSD: unicode.c,v 1.3 2015/06/21 14:09:47 maxv Exp $ */
      2 
      3 /*-
      4  * Copyright (c) 2007 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Dieter Baron.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: unicode.c,v 1.3 2015/06/21 14:09:47 maxv Exp $");
     34 
     35 #include <sys/null.h>
     36 
     37 #include "unicode.h"
     38 
     39 size_t
     40 utf8_to_utf16(uint16_t *dst, size_t dst_len,
     41 	      const char *src, size_t src_len,
     42 	      int flags, int *errp)
     43 {
     44 	const unsigned char *s;
     45 	size_t spos, dpos;
     46 	int error;
     47 	uint16_t c;
     48 
     49 #define IS_CONT(c)	(((c)&0xc0) == 0x80)
     50 
     51 	error = 0;
     52 	s = (const unsigned char *)src;
     53 	spos = dpos = 0;
     54 	while (spos < src_len) {
     55 		if (s[spos] < 0x80) {
     56 			c = s[spos++];
     57 		} else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
     58 			 && (spos >= src_len || !IS_CONT(s[spos+1]))
     59 			 && s[spos]>=0xa0) {
     60 			/* not valid UTF-8, assume ISO 8859-1 */
     61 			c = s[spos++];
     62 		} else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
     63 			/* continuation byte without lead byte
     64 			 * or lead byte for codepoint above 0x10ffff */
     65 			error++;
     66 			spos++;
     67 			continue;
     68 		} else if (s[spos] < 0xe0) {
     69 			if (spos >= src_len || !IS_CONT(s[spos+1])) {
     70 				spos++;
     71 				error++;
     72 				continue;
     73 			}
     74 			c = ((s[spos] & 0x3f) << 6) | (s[spos+1] & 0x3f);
     75 			spos += 2;
     76 			if (c < 0x80) {
     77 				/* overlong encoding */
     78 				error++;
     79 				continue;
     80 			}
     81 		} else if (s[spos] < 0xf0) {
     82 			if (spos >= src_len-2 ||
     83 			    !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
     84 				spos++;
     85 				error++;
     86 				continue;
     87 			}
     88 			c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6)
     89 			    | (s[spos+2] & 0x3f);
     90 			spos += 3;
     91 			if (c < 0x800 || (c & 0xdf00) == 0xd800 ) {
     92 				/* overlong encoding or encoded surrogate */
     93 				error++;
     94 				continue;
     95 			}
     96 		} else {
     97 			uint32_t cc;
     98 			/* UTF-16 surrogate pair */
     99 
    100 			if (spos >= src_len-3 || !IS_CONT(s[spos+1])
    101 			    || !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
    102 				spos++;
    103 				error++;
    104 				continue;
    105 			}
    106 			cc = ((s[spos] & 0x03) << 18) | ((s[spos+1] & 0x3f) << 12)
    107 			    | ((s[spos+2] & 0x3f) << 6) | (s[spos+3] & 0x3f);
    108 			spos += 4;
    109 			if (cc < 0x10000) {
    110 				/* overlong encoding */
    111 				error++;
    112 				continue;
    113 			}
    114 			if (dst && dpos < dst_len)
    115 				dst[dpos] = (0xd800 | ((cc-0x10000)>>10));
    116 			dpos++;
    117 			c = 0xdc00 | ((cc-0x10000) & 0x3ffff);
    118 		}
    119 
    120 		if (dst && dpos < dst_len)
    121 			dst[dpos] = c;
    122 		dpos++;
    123 	}
    124 
    125 	if (errp)
    126 		*errp = error;
    127 	return dpos;
    128 #undef IS_CONT
    129 }
    130 
    131 
/*
 * Convert UTF-16 code units to a UTF-8 byte string.
 *
 * dst      Output buffer; may be NULL, in which case nothing is stored
 *          and only the required length is computed.
 * dst_len  Capacity of dst in bytes.  As soon as a complete encoded
 *          character does not fit, storing stops for the rest of the
 *          conversion; counting continues.
 * src      Input code units; src_len units are examined.
 * flags    Not used by this function.
 * errp     If non-NULL, receives the number of unpaired surrogates that
 *          were skipped.
 *
 * Returns the number of bytes the whole input converts to, which may
 * exceed dst_len.  No terminating byte is appended.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
	      const uint16_t *src, size_t src_len,
	      int flags, int *errp)
{
	size_t spos, dpos;
	int error;

	/* Stop storing once the next l bytes would not fit in dst. */
#define CHECK_LENGTH(l)	do {						\
	if (dst_len < (size_t)(l) || dpos > dst_len - (size_t)(l))	\
		dst = NULL;						\
} while (0)
	/* Store one byte if still storing; always count it. */
#define ADD_BYTE(b)	do {						\
	if (dst != NULL)						\
		dst[dpos] = (char)(b);					\
	dpos++;								\
} while (0)

	(void)flags;

	error = 0;
	dpos = 0;
	for (spos = 0; spos < src_len; spos++) {
		if (src[spos] < 0x80) {
			CHECK_LENGTH(1);
			ADD_BYTE(src[spos]);
		} else if (src[spos] < 0x800) {
			CHECK_LENGTH(2);
			ADD_BYTE(0xc0 | (src[spos]>>6));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		} else if ((src[spos] & 0xfc00) == 0xd800) {
			uint32_t c;
			/* first surrogate */
			if (spos + 1 >= src_len
			    || (src[spos+1] & 0xfc00) != 0xdc00) {
				/* no second surrogate present */
				error++;
				continue;
			}
			CHECK_LENGTH(4);
			c = (((uint32_t)(src[spos] & 0x3ff) << 10)
			    | (src[spos+1] & 0x3ff)) + 0x10000;
			/* the second surrogate has been consumed as well */
			spos++;
			ADD_BYTE(0xf0 | (c>>18));
			ADD_BYTE(0x80 | ((c>>12) & 0x3f));
			ADD_BYTE(0x80 | ((c>>6) & 0x3f));
			ADD_BYTE(0x80 | (c & 0x3f));
		} else if ((src[spos] & 0xfc00) == 0xdc00) {
			/* second surrogate without preceding first surrogate */
			error++;
		} else {
			CHECK_LENGTH(3);
			ADD_BYTE(0xe0 | (src[spos]>>12));
			ADD_BYTE(0x80 | ((src[spos]>>6) & 0x3f));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		}
	}

	if (errp)
		*errp = error;
	return dpos;
#undef ADD_BYTE
#undef CHECK_LENGTH
}
    185