Home | History | Annotate | Line # | Download | only in hfs
unicode.c revision 1.1.1.1.32.1
      1  1.1.1.1.32.1  bouyer /* $NetBSD: unicode.c,v 1.1.1.1.32.1 2007/12/13 21:56:51 bouyer Exp $ */
      2           1.1   dillo 
      3           1.1   dillo /*-
      4           1.1   dillo  * Copyright (c) 2007 The NetBSD Foundation, Inc.
      5           1.1   dillo  * All rights reserved.
      6           1.1   dillo  *
      7           1.1   dillo  * This code is derived from software contributed to The NetBSD Foundation
      8           1.1   dillo  * by Dieter Baron.
      9           1.1   dillo  *
     10           1.1   dillo  * Redistribution and use in source and binary forms, with or without
     11           1.1   dillo  * modification, are permitted provided that the following conditions
     12           1.1   dillo  * are met:
     13           1.1   dillo  * 1. Redistributions of source code must retain the above copyright
     14           1.1   dillo  *    notice, this list of conditions and the following disclaimer.
     15           1.1   dillo  * 2. Redistributions in binary form must reproduce the above copyright
     16           1.1   dillo  *    notice, this list of conditions and the following disclaimer in the
     17           1.1   dillo  *    documentation and/or other materials provided with the distribution.
     18           1.1   dillo  *
     19           1.1   dillo  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20           1.1   dillo  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21           1.1   dillo  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22           1.1   dillo  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23           1.1   dillo  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24           1.1   dillo  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25           1.1   dillo  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26           1.1   dillo  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27           1.1   dillo  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28           1.1   dillo  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29           1.1   dillo  * POSSIBILITY OF SUCH DAMAGE.
     30           1.1   dillo  */
     31           1.1   dillo 
     32  1.1.1.1.32.1  bouyer #include <sys/cdefs.h>
     33  1.1.1.1.32.1  bouyer __KERNEL_RCSID(0, "$NetBSD: unicode.c,v 1.1.1.1.32.1 2007/12/13 21:56:51 bouyer Exp $");
     34  1.1.1.1.32.1  bouyer 
     35           1.1   dillo #include <sys/null.h>
     36           1.1   dillo 
     37           1.1   dillo #include "unicode.h"
     38           1.1   dillo 
     39           1.1   dillo size_t
     40           1.1   dillo utf8_to_utf16(uint16_t *dst, size_t dst_len,
     41           1.1   dillo 	      const char *src, size_t src_len,
     42           1.1   dillo 	      int flags, int *errp)
     43           1.1   dillo {
     44           1.1   dillo     const unsigned char *s;
     45           1.1   dillo     size_t spos, dpos;
     46           1.1   dillo     int error;
     47           1.1   dillo     uint16_t c;
     48           1.1   dillo 
     49           1.1   dillo #define IS_CONT(c)	(((c)&0xc0) == 0x80)
     50           1.1   dillo 
     51           1.1   dillo     error = 0;
     52           1.1   dillo     s = (const unsigned char *)src;
     53           1.1   dillo     spos = dpos = 0;
     54           1.1   dillo     while (spos<src_len) {
     55           1.1   dillo 	if (s[spos] < 0x80)
     56           1.1   dillo 	    c = s[spos++];
     57           1.1   dillo 	else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
     58           1.1   dillo 		 && (spos >= src_len || !IS_CONT(s[spos+1]))
     59           1.1   dillo 		 && s[spos]>=0xa0) {
     60           1.1   dillo 	    /* not valid UTF-8, assume ISO 8859-1 */
     61           1.1   dillo 	    c = s[spos++];
     62           1.1   dillo 	}
     63           1.1   dillo 	else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
     64           1.1   dillo 	    /* continuation byte without lead byte
     65           1.1   dillo 	       or lead byte for codepoint above 0x10ffff */
     66           1.1   dillo 	    error++;
     67           1.1   dillo 	    spos++;
     68           1.1   dillo 	    continue;
     69           1.1   dillo 	}
     70           1.1   dillo 	else if (s[spos] < 0xe0) {
     71           1.1   dillo 	    if (spos >= src_len || !IS_CONT(s[spos+1])) {
     72           1.1   dillo 		spos++;
     73           1.1   dillo 		error++;
     74           1.1   dillo 		continue;
     75           1.1   dillo 	    }
     76           1.1   dillo 	    c = ((s[spos] & 0x3f) << 6) | (s[spos+1] & 0x3f);
     77           1.1   dillo 	    spos += 2;
     78           1.1   dillo 	    if (c < 0x80) {
     79           1.1   dillo 		/* overlong encoding */
     80           1.1   dillo 		error++;
     81           1.1   dillo 		continue;
     82           1.1   dillo 	    }
     83           1.1   dillo 	}
     84           1.1   dillo 	else if (s[spos] < 0xf0) {
     85           1.1   dillo 	    if (spos >= src_len-2
     86           1.1   dillo 		|| !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
     87           1.1   dillo 		spos++;
     88           1.1   dillo 		error++;
     89           1.1   dillo 		continue;
     90           1.1   dillo 	    }
     91           1.1   dillo 	    c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6)
     92           1.1   dillo 		| (s[spos+2] & 0x3f);
     93           1.1   dillo 	    spos += 3;
     94           1.1   dillo 	    if (c < 0x800 || (c & 0xdf00) == 0xd800 ) {
     95           1.1   dillo 		/* overlong encoding or encoded surrogate */
     96           1.1   dillo 		error++;
     97           1.1   dillo 		continue;
     98           1.1   dillo 	    }
     99           1.1   dillo 	}
    100           1.1   dillo 	else {
    101           1.1   dillo 	    uint32_t cc;
    102           1.1   dillo 	    /* UTF-16 surrogate pair */
    103           1.1   dillo 
    104           1.1   dillo 	    if (spos >= src_len-3 || !IS_CONT(s[spos+1])
    105           1.1   dillo 		|| !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
    106           1.1   dillo 		spos++;
    107           1.1   dillo 		error++;
    108           1.1   dillo 
    109           1.1   dillo 		continue;
    110           1.1   dillo 	    }
    111           1.1   dillo 	    cc = ((s[spos] & 0x03) << 18) | ((s[spos+1] & 0x3f) << 12)
    112           1.1   dillo 		 | ((s[spos+2] & 0x3f) << 6) | (s[spos+3] & 0x3f);
    113           1.1   dillo 	    spos += 4;
    114           1.1   dillo 	    if (cc < 0x10000) {
    115           1.1   dillo 		/* overlong encoding */
    116           1.1   dillo 		error++;
    117           1.1   dillo 		continue;
    118           1.1   dillo 	    }
    119           1.1   dillo 	    if (dst && dpos < dst_len)
    120           1.1   dillo 		dst[dpos] = (0xd800 | ((cc-0x10000)>>10));
    121           1.1   dillo 	    dpos++;
    122           1.1   dillo 	    c = 0xdc00 | ((cc-0x10000) & 0x3ffff);
    123           1.1   dillo 	}
    124           1.1   dillo 
    125           1.1   dillo 	if (dst && dpos < dst_len)
    126           1.1   dillo 	    dst[dpos] = c;
    127           1.1   dillo 	dpos++;
    128           1.1   dillo     }
    129           1.1   dillo 
    130           1.1   dillo     if (errp)
    131           1.1   dillo 	*errp = error;
    132           1.1   dillo 
    133           1.1   dillo     return dpos;
    134           1.1   dillo 
    135           1.1   dillo #undef IS_CONT
    136           1.1   dillo }
    137           1.1   dillo 
    138           1.1   dillo 
/*
 * Convert the UTF-16 string src (src_len 16-bit units) to UTF-8.
 *
 * At most dst_len bytes are stored in dst; dst may be NULL, in which
 * case nothing is stored.  A character is only stored if all of its
 * bytes fit; once one character does not fit, no further output is
 * stored, but it is still counted.  The output is not NUL-terminated.
 *
 * flags is currently not used by this function.
 *
 * Unpaired surrogates are skipped and counted; the count is stored in
 * *errp if errp is not NULL.
 *
 * Returns the number of bytes the complete conversion needs, which may
 * be larger than dst_len.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
	      const uint16_t *src, size_t src_len,
	      int flags, int *errp)
{
    size_t spos, dpos;
    int error;

/* stop storing output if the next l bytes do not fit (no unsigned wrap) */
#define CHECK_LENGTH(l)	do { if (dst_len < dpos + (l)) dst = NULL; } while (0)
#define ADD_BYTE(b)	(dst ? dst[dpos] = (char)(b) : 0, dpos++)

    error = 0;
    dpos = 0;
    for (spos=0; spos<src_len; spos++) {
	if (src[spos] < 0x80) {
	    CHECK_LENGTH(1);
	    ADD_BYTE(src[spos]);
	}
	else if (src[spos] < 0x800) {
	    CHECK_LENGTH(2);
	    ADD_BYTE(0xc0 | (src[spos]>>6));
	    ADD_BYTE(0x80 | (src[spos] & 0x3f));
	}
	else if ((src[spos] & 0xfc00) == 0xd800) {
	    uint32_t c;
	    /* first (high) surrogate, 0xd800-0xdbff */
	    if (spos + 1 >= src_len || (src[spos+1] & 0xfc00) != 0xdc00) {
		/* no second surrogate present */
		error++;
		continue;
	    }
	    /* combine the pair before consuming the second unit */
	    c = (((uint32_t)(src[spos]&0x3ff) << 10)
		| (uint32_t)(src[spos+1]&0x3ff)) + 0x10000;
	    spos++;
	    CHECK_LENGTH(4);
	    ADD_BYTE(0xf0 | (c>>18));
	    ADD_BYTE(0x80 | ((c>>12) & 0x3f));
	    ADD_BYTE(0x80 | ((c>>6) & 0x3f));
	    ADD_BYTE(0x80 | (c & 0x3f));
	}
	else if ((src[spos] & 0xfc00) == 0xdc00) {
	    /* second surrogate without preceding first surrogate */
	    error++;
	}
	else {
	    CHECK_LENGTH(3);
	    ADD_BYTE(0xe0 | (src[spos]>>12));
	    ADD_BYTE(0x80 | ((src[spos]>>6) & 0x3f));
	    ADD_BYTE(0x80 | (src[spos] & 0x3f));
	}
    }

    if (errp)
	*errp = error;

    return dpos;

#undef ADD_BYTE
#undef CHECK_LENGTH
}
    198