Home | History | Annotate | Line # | Download | only in hfs
unicode.c revision 1.1
      1 /* $NetBSD: unicode.c,v 1.1 2007/03/06 00:10:39 dillo Exp $ */
      2 
      3 /*-
      4  * Copyright (c) 2007 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Dieter Baron.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/null.h>
     33 
     34 #include "unicode.h"
     35 
     36 size_t
     37 utf8_to_utf16(uint16_t *dst, size_t dst_len,
     38 	      const char *src, size_t src_len,
     39 	      int flags, int *errp)
     40 {
     41     const unsigned char *s;
     42     size_t spos, dpos;
     43     int error;
     44     uint16_t c;
     45 
     46 #define IS_CONT(c)	(((c)&0xc0) == 0x80)
     47 
     48     error = 0;
     49     s = (const unsigned char *)src;
     50     spos = dpos = 0;
     51     while (spos<src_len) {
     52 	if (s[spos] < 0x80)
     53 	    c = s[spos++];
     54 	else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
     55 		 && (spos >= src_len || !IS_CONT(s[spos+1]))
     56 		 && s[spos]>=0xa0) {
     57 	    /* not valid UTF-8, assume ISO 8859-1 */
     58 	    c = s[spos++];
     59 	}
     60 	else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
     61 	    /* continuation byte without lead byte
     62 	       or lead byte for codepoint above 0x10ffff */
     63 	    error++;
     64 	    spos++;
     65 	    continue;
     66 	}
     67 	else if (s[spos] < 0xe0) {
     68 	    if (spos >= src_len || !IS_CONT(s[spos+1])) {
     69 		spos++;
     70 		error++;
     71 		continue;
     72 	    }
     73 	    c = ((s[spos] & 0x3f) << 6) | (s[spos+1] & 0x3f);
     74 	    spos += 2;
     75 	    if (c < 0x80) {
     76 		/* overlong encoding */
     77 		error++;
     78 		continue;
     79 	    }
     80 	}
     81 	else if (s[spos] < 0xf0) {
     82 	    if (spos >= src_len-2
     83 		|| !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
     84 		spos++;
     85 		error++;
     86 		continue;
     87 	    }
     88 	    c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6)
     89 		| (s[spos+2] & 0x3f);
     90 	    spos += 3;
     91 	    if (c < 0x800 || (c & 0xdf00) == 0xd800 ) {
     92 		/* overlong encoding or encoded surrogate */
     93 		error++;
     94 		continue;
     95 	    }
     96 	}
     97 	else {
     98 	    uint32_t cc;
     99 	    /* UTF-16 surrogate pair */
    100 
    101 	    if (spos >= src_len-3 || !IS_CONT(s[spos+1])
    102 		|| !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
    103 		spos++;
    104 		error++;
    105 
    106 		continue;
    107 	    }
    108 	    cc = ((s[spos] & 0x03) << 18) | ((s[spos+1] & 0x3f) << 12)
    109 		 | ((s[spos+2] & 0x3f) << 6) | (s[spos+3] & 0x3f);
    110 	    spos += 4;
    111 	    if (cc < 0x10000) {
    112 		/* overlong encoding */
    113 		error++;
    114 		continue;
    115 	    }
    116 	    if (dst && dpos < dst_len)
    117 		dst[dpos] = (0xd800 | ((cc-0x10000)>>10));
    118 	    dpos++;
    119 	    c = 0xdc00 | ((cc-0x10000) & 0x3ffff);
    120 	}
    121 
    122 	if (dst && dpos < dst_len)
    123 	    dst[dpos] = c;
    124 	dpos++;
    125     }
    126 
    127     if (errp)
    128 	*errp = error;
    129 
    130     return dpos;
    131 
    132 #undef IS_CONT
    133 }
    134 
    135 
/*
 * Convert UTF-16 code units to a UTF-8 byte string.
 *
 *   dst      output buffer; may be NULL to only compute the length needed
 *   dst_len  capacity of dst, in bytes
 *   src      input code units (src_len of them)
 *   flags    currently unused
 *   errp     if non-NULL, receives the number of unpaired surrogates found
 *
 * Unpaired surrogates are skipped and counted.  Returns the number of
 * bytes the input converts to.  Once an encoded character does not fit
 * completely into dst, nothing more is stored, but counting continues.
 * No terminating NUL is written.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
	      const uint16_t *src, size_t src_len,
	      int flags, int *errp)
{
    size_t spos, dpos;
    int error;

/* stop storing once l more bytes would not fit; avoids size_t underflow */
#define CHECK_LENGTH(l)							\
    do {								\
	if (dst_len < (size_t)(l) || dpos > dst_len - (size_t)(l))	\
	    dst = NULL;							\
    } while (0)
#define ADD_BYTE(b)							\
    do {								\
	if (dst)							\
	    dst[dpos] = (char)(b);					\
	dpos++;								\
    } while (0)

    (void)flags;

    error = 0;
    dpos = 0;
    for (spos = 0; spos < src_len; spos++) {
	if (src[spos] < 0x80) {
	    CHECK_LENGTH(1);
	    ADD_BYTE(src[spos]);
	}
	else if (src[spos] < 0x800) {
	    CHECK_LENGTH(2);
	    ADD_BYTE(0xc0 | (src[spos] >> 6));
	    ADD_BYTE(0x80 | (src[spos] & 0x3f));
	}
	else if ((src[spos] & 0xfc00) == 0xd800) {
	    uint32_t c;
	    /* first (high) surrogate */
	    if (spos + 1 >= src_len || (src[spos+1] & 0xfc00) != 0xdc00) {
		/* no second surrogate present */
		error++;
		continue;
	    }
	    /* combine the pair before stepping over the second unit */
	    c = ((((uint32_t)src[spos] & 0x3ff) << 10)
		 | ((uint32_t)src[spos+1] & 0x3ff)) + 0x10000;
	    spos++;
	    CHECK_LENGTH(4);
	    ADD_BYTE(0xf0 | (c >> 18));
	    ADD_BYTE(0x80 | ((c >> 12) & 0x3f));
	    ADD_BYTE(0x80 | ((c >> 6) & 0x3f));
	    ADD_BYTE(0x80 | (c & 0x3f));
	}
	else if ((src[spos] & 0xfc00) == 0xdc00) {
	    /* second surrogate without preceding first surrogate */
	    error++;
	}
	else {
	    CHECK_LENGTH(3);
	    ADD_BYTE(0xe0 | (src[spos] >> 12));
	    ADD_BYTE(0x80 | ((src[spos] >> 6) & 0x3f));
	    ADD_BYTE(0x80 | (src[spos] & 0x3f));
	}
    }

    if (errp)
	*errp = error;

    return dpos;

#undef ADD_BYTE
#undef CHECK_LENGTH
}
    195