Home | History | Annotate | Line # | Download | only in wind
      1 /*	$NetBSD: utf8.c,v 1.3 2023/06/19 21:41:45 christos Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 2004, 2006, 2007, 2008 Kungliga Tekniska Högskolan
      5  * (Royal Institute of Technology, Stockholm, Sweden).
      6  * All rights reserved.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  *
     12  * 1. Redistributions of source code must retain the above copyright
     13  *    notice, this list of conditions and the following disclaimer.
     14  *
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * 3. Neither the name of the Institute nor the names of its contributors
     20  *    may be used to endorse or promote products derived from this software
     21  *    without specific prior written permission.
     22  *
     23  * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
     24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
     27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     33  * SUCH DAMAGE.
     34  */
     35 
     36 #include <config.h>
     37 #include "windlocl.h"
     38 
     39 static int
     40 utf8toutf32(const unsigned char **pp, uint32_t *out)
     41 {
     42     const unsigned char *p = *pp;
     43     unsigned c = *p;
     44 
     45     if (c & 0x80) {
     46 	if ((c & 0xE0) == 0xC0) {
     47 	    const unsigned c2 = *++p;
     48 	    if ((c2 & 0xC0) == 0x80) {
     49 		*out =  ((c  & 0x1F) << 6)
     50 		    | (c2 & 0x3F);
     51 	    } else {
     52 		return WIND_ERR_INVALID_UTF8;
     53 	    }
     54 	} else if ((c & 0xF0) == 0xE0) {
     55 	    const unsigned c2 = *++p;
     56 	    if ((c2 & 0xC0) == 0x80) {
     57 		const unsigned c3 = *++p;
     58 		if ((c3 & 0xC0) == 0x80) {
     59 		    *out =   ((c  & 0x0F) << 12)
     60 			| ((c2 & 0x3F) << 6)
     61 			|  (c3 & 0x3F);
     62 		} else {
     63 		    return WIND_ERR_INVALID_UTF8;
     64 		}
     65 	    } else {
     66 		return WIND_ERR_INVALID_UTF8;
     67 	    }
     68 	} else if ((c & 0xF8) == 0xF0) {
     69 	    const unsigned c2 = *++p;
     70 	    if ((c2 & 0xC0) == 0x80) {
     71 		const unsigned c3 = *++p;
     72 		if ((c3 & 0xC0) == 0x80) {
     73 		    const unsigned c4 = *++p;
     74 		    if ((c4 & 0xC0) == 0x80) {
     75 			*out =   ((c  & 0x07) << 18)
     76 			    | ((c2 & 0x3F) << 12)
     77 			    | ((c3 & 0x3F) <<  6)
     78 			    |  (c4 & 0x3F);
     79 		    } else {
     80 			return WIND_ERR_INVALID_UTF8;
     81 		    }
     82 		} else {
     83 		    return WIND_ERR_INVALID_UTF8;
     84 		}
     85 	    } else {
     86 		return WIND_ERR_INVALID_UTF8;
     87 	    }
     88 	} else {
     89 	    return WIND_ERR_INVALID_UTF8;
     90 	}
     91     } else {
     92 	*out = c;
     93     }
     94 
     95     *pp = p;
     96 
     97     return 0;
     98 }
     99 
    100 /**
    101  * Convert an UTF-8 string to an UCS4 string.
    102  *
    103  * @param in an UTF-8 string to convert.
    104  * @param out the resulting UCS4 strint, must be at least
    105  * wind_utf8ucs4_length() long.  If out is NULL, the function will
    106  * calculate the needed space for the out variable (just like
    107  * wind_utf8ucs4_length()).
    108  * @param out_len before processing out_len should be the length of
    109  * the out variable, after processing it will be the length of the out
    110  * string.
    111  *
    112  * @return returns 0 on success, an wind error code otherwise
    113  * @ingroup wind
    114  */
    115 
    116 int
    117 wind_utf8ucs4(const char *in, uint32_t *out, size_t *out_len)
    118 {
    119     const unsigned char *p;
    120     size_t o = 0;
    121     int ret;
    122 
    123     for (p = (const unsigned char *)in; *p != '\0'; ++p) {
    124 	uint32_t u;
    125 
    126 	ret = utf8toutf32(&p, &u);
    127 	if (ret)
    128 	    return ret;
    129 
    130 	if (out) {
    131 	    if (o >= *out_len)
    132 		return WIND_ERR_OVERRUN;
    133 	    out[o] = u;
    134 	}
    135 	o++;
    136     }
    137     *out_len = o;
    138     return 0;
    139 }
    140 
    141 /**
    142  * Calculate the length of from converting a UTF-8 string to a UCS4
    143  * string.
    144  *
    145  * @param in an UTF-8 string to convert.
    146  * @param out_len the length of the resulting UCS4 string.
    147  *
    148  * @return returns 0 on success, an wind error code otherwise
    149  * @ingroup wind
    150  */
    151 
    152 int
    153 wind_utf8ucs4_length(const char *in, size_t *out_len)
    154 {
    155     return wind_utf8ucs4(in, NULL, out_len);
    156 }
    157 
    158 static const char first_char[4] =
    159     { 0x00, 0xC0, 0xE0, 0xF0 };
    160 
    161 /**
    162  * Convert an UCS4 string to a UTF-8 string.
    163  *
    164  * @param in an UCS4 string to convert.
    165  * @param in_len the length input array.
    166 
    167  * @param out the resulting UTF-8 strint, must be at least
    168  * wind_ucs4utf8_length() + 1 long (the extra char for the NUL).  If
    169  * out is NULL, the function will calculate the needed space for the
    170  * out variable (just like wind_ucs4utf8_length()).
    171 
    172  * @param out_len before processing out_len should be the length of
    173  * the out variable, after processing it will be the length of the out
    174  * string.
    175  *
    176  * @return returns 0 on success, an wind error code otherwise
    177  * @ingroup wind
    178  */
    179 
    180 int
    181 wind_ucs4utf8(const uint32_t *in, size_t in_len, char *out, size_t *out_len)
    182 {
    183     uint32_t ch;
    184     size_t i, len, o;
    185 
    186     for (o = 0, i = 0; i < in_len; i++) {
    187 	ch = in[i];
    188 
    189 	if (ch < 0x80) {
    190 	    len = 1;
    191 	} else if (ch < 0x800) {
    192 	    len = 2;
    193 	} else if (ch < 0x10000) {
    194 	    len = 3;
    195 	} else if (ch <= 0x10FFFF) {
    196 	    len = 4;
    197 	} else
    198 	    return WIND_ERR_INVALID_UTF32;
    199 
    200 	o += len;
    201 
    202 	if (out) {
    203 	    if (o >= *out_len)
    204 		return WIND_ERR_OVERRUN;
    205 
    206 	    switch(len) {
    207 	    case 4:
    208 		out[3] = (ch | 0x80) & 0xbf;
    209 		ch = ch >> 6;
    210                 /* FALLTHROUGH */
    211 	    case 3:
    212 		out[2] = (ch | 0x80) & 0xbf;
    213 		ch = ch >> 6;
    214                 /* FALLTHROUGH */
    215 	    case 2:
    216 		out[1] = (ch | 0x80) & 0xbf;
    217 		ch = ch >> 6;
    218                 /* FALLTHROUGH */
    219 	    case 1:
    220 		out[0] = ch | first_char[len - 1];
    221                 /* FALLTHROUGH */
    222 	    }
    223 	}
    224 	out += len;
    225     }
    226     if (out) {
    227 	if (o + 1 >= *out_len)
    228 	    return WIND_ERR_OVERRUN;
    229 	*out = '\0';
    230     }
    231     *out_len = o;
    232     return 0;
    233 }
    234 
    235 /**
    236  * Calculate the length of from converting a UCS4 string to an UTF-8 string.
    237  *
    238  * @param in an UCS4 string to convert.
    239  * @param in_len the length of UCS4 string to convert.
    240  * @param out_len the length of the resulting UTF-8 string.
    241  *
    242  * @return returns 0 on success, an wind error code otherwise
    243  * @ingroup wind
    244  */
    245 
    246 int
    247 wind_ucs4utf8_length(const uint32_t *in, size_t in_len, size_t *out_len)
    248 {
    249     return wind_ucs4utf8(in, in_len, NULL, out_len);
    250 }
    251 
    252 /**
    253  * Read in an UCS2 from a buffer.
    254  *
    255  * @param ptr The input buffer to read from.
    256  * @param len the length of the input buffer.
    257  * @param flags Flags to control the behavior of the function.
    258  * @param out the output UCS2, the array must be at least out/2 long.
    259  * @param out_len the output length
    260  *
    261  * @return returns 0 on success, an wind error code otherwise.
    262  * @ingroup wind
    263  */
    264 
    265 int
    266 wind_ucs2read(const void *ptr, size_t len, unsigned int *flags,
    267 	      uint16_t *out, size_t *out_len)
    268 {
    269     const unsigned char *p = ptr;
    270     int little = ((*flags) & WIND_RW_LE);
    271     size_t olen = *out_len;
    272 
    273     /** if len is zero, flags are unchanged */
    274     if (len == 0) {
    275 	*out_len = 0;
    276 	return 0;
    277     }
    278 
    279     /** if len is odd, WIND_ERR_LENGTH_NOT_MOD2 is returned */
    280     if (len & 1)
    281 	return WIND_ERR_LENGTH_NOT_MOD2;
    282 
    283     /**
    284      * If the flags WIND_RW_BOM is set, check for BOM. If not BOM is
    285      * found, check is LE/BE flag is already and use that otherwise
    286      * fail with WIND_ERR_NO_BOM. When done, clear WIND_RW_BOM and
    287      * the LE/BE flag and set the resulting LE/BE flag.
    288      */
    289     if ((*flags) & WIND_RW_BOM) {
    290 	uint16_t bom = (p[0] << 8) + p[1];
    291 	if (bom == 0xfffe || bom == 0xfeff) {
    292 	    little = (bom == 0xfffe);
    293 	    p += 2;
    294 	    len -= 2;
    295 	} else if (((*flags) & (WIND_RW_LE|WIND_RW_BE)) != 0) {
    296 	    /* little already set */
    297 	} else
    298 	    return WIND_ERR_NO_BOM;
    299 	*flags = ((*flags) & ~(WIND_RW_BOM|WIND_RW_LE|WIND_RW_BE));
    300 	*flags |= little ? WIND_RW_LE : WIND_RW_BE;
    301     }
    302 
    303     while (len) {
    304 	if (olen < 1)
    305 	    return WIND_ERR_OVERRUN;
    306 	if (little)
    307 	    *out = (p[1] << 8) + p[0];
    308 	else
    309 	    *out = (p[0] << 8) + p[1];
    310 	out++; p += 2; len -= 2; olen--;
    311     }
    312     *out_len -= olen;
    313     return 0;
    314 }
    315 
    316 /**
    317  * Write an UCS2 string to a buffer.
    318  *
    319  * @param in The input UCS2 string.
    320  * @param in_len the length of the input buffer.
    321  * @param flags Flags to control the behavior of the function.
    322  * @param ptr The input buffer to write to, the array must be at least
    323  * (in + 1) * 2 bytes long.
    324  * @param out_len the output length
    325  *
    326  * @return returns 0 on success, an wind error code otherwise.
    327  * @ingroup wind
    328  */
    329 
    330 int
    331 wind_ucs2write(const uint16_t *in, size_t in_len, unsigned int *flags,
    332 	       void *ptr, size_t *out_len)
    333 {
    334     unsigned char *p = ptr;
    335     size_t len = *out_len;
    336 
    337     /** If in buffer is not of length be mod 2, WIND_ERR_LENGTH_NOT_MOD2 is returned*/
    338     if (len & 1)
    339 	return WIND_ERR_LENGTH_NOT_MOD2;
    340 
    341     /** On zero input length, flags are preserved */
    342     if (in_len == 0) {
    343 	*out_len = 0;
    344 	return 0;
    345     }
    346     /** If flags have WIND_RW_BOM set, the byte order mark is written
    347      * first to the output data */
    348     if ((*flags) & WIND_RW_BOM) {
    349 	uint16_t bom = 0xfffe;
    350 
    351 	if (len < 2)
    352 	    return WIND_ERR_OVERRUN;
    353 
    354 	if ((*flags) & WIND_RW_LE) {
    355 	    p[0] = (bom     ) & 0xff;
    356 	    p[1] = (bom >> 8) & 0xff;
    357 	} else {
    358 	    p[1] = (bom     ) & 0xff;
    359 	    p[0] = (bom >> 8) & 0xff;
    360 	}
    361 	len -= 2;
    362     }
    363 
    364     while (in_len) {
    365 	/** If the output wont fit into out_len, WIND_ERR_OVERRUN is returned */
    366 	if (len < 2)
    367 	    return WIND_ERR_OVERRUN;
    368 	if ((*flags) & WIND_RW_LE) {
    369 	    p[0] = (in[0]     ) & 0xff;
    370 	    p[1] = (in[0] >> 8) & 0xff;
    371 	} else {
    372 	    p[1] = (in[0]     ) & 0xff;
    373 	    p[0] = (in[0] >> 8) & 0xff;
    374 	}
    375 	len -= 2;
    376 	in_len--;
    377 	p += 2;
    378 	in++;
    379     }
    380     *out_len -= len;
    381     return 0;
    382 }
    383 
    384 
    385 /**
    386  * Convert an UTF-8 string to an UCS2 string.
    387  *
    388  * @param in an UTF-8 string to convert.
    389  * @param out the resulting UCS2 strint, must be at least
    390  * wind_utf8ucs2_length() long.  If out is NULL, the function will
    391  * calculate the needed space for the out variable (just like
    392  * wind_utf8ucs2_length()).
    393  * @param out_len before processing out_len should be the length of
    394  * the out variable, after processing it will be the length of the out
    395  * string.
    396  *
    397  * @return returns 0 on success, an wind error code otherwise
    398  * @ingroup wind
    399  */
    400 
    401 int
    402 wind_utf8ucs2(const char *in, uint16_t *out, size_t *out_len)
    403 {
    404     const unsigned char *p;
    405     size_t o = 0;
    406     int ret;
    407 
    408     for (p = (const unsigned char *)in; *p != '\0'; ++p) {
    409 	uint32_t u;
    410 
    411 	ret = utf8toutf32(&p, &u);
    412 	if (ret)
    413 	    return ret;
    414 
    415 	if (u & 0xffff0000)
    416 	    return WIND_ERR_NOT_UTF16;
    417 
    418 	if (out) {
    419 	    if (o >= *out_len)
    420 		return WIND_ERR_OVERRUN;
    421 	    out[o] = u;
    422 	}
    423 	o++;
    424     }
    425     *out_len = o;
    426     return 0;
    427 }
    428 
    429 /**
    430  * Calculate the length of from converting a UTF-8 string to a UCS2
    431  * string.
    432  *
    433  * @param in an UTF-8 string to convert.
    434  * @param out_len the length of the resulting UCS4 string.
    435  *
    436  * @return returns 0 on success, an wind error code otherwise
    437  * @ingroup wind
    438  */
    439 
    440 int
    441 wind_utf8ucs2_length(const char *in, size_t *out_len)
    442 {
    443     return wind_utf8ucs2(in, NULL, out_len);
    444 }
    445 
    446 /**
    447  * Convert an UCS2 string to a UTF-8 string.
    448  *
    449  * @param in an UCS2 string to convert.
    450  * @param in_len the length of the in UCS2 string.
    451  * @param out the resulting UTF-8 strint, must be at least
    452  * wind_ucs2utf8_length() long.  If out is NULL, the function will
    453  * calculate the needed space for the out variable (just like
    454  * wind_ucs2utf8_length()).
    455  * @param out_len before processing out_len should be the length of
    456  * the out variable, after processing it will be the length of the out
    457  * string.
    458  *
    459  * @return returns 0 on success, an wind error code otherwise
    460  * @ingroup wind
    461  */
    462 
    463 int
    464 wind_ucs2utf8(const uint16_t *in, size_t in_len, char *out, size_t *out_len)
    465 {
    466     uint16_t ch;
    467     size_t i, len, o;
    468 
    469     for (o = 0, i = 0; i < in_len; i++) {
    470 	ch = in[i];
    471 
    472 	if (ch < 0x80) {
    473 	    len = 1;
    474 	} else if (ch < 0x800) {
    475 	    len = 2;
    476 	} else
    477 	    len = 3;
    478 
    479 	o += len;
    480 
    481 	if (out) {
    482 	    if (o >= *out_len)
    483 		return WIND_ERR_OVERRUN;
    484 
    485 	    switch(len) {
    486 	    case 3:
    487 		out[2] = (ch | 0x80) & 0xbf;
    488 		ch = ch >> 6;
    489                 /* FALLTHROUGH */
    490 	    case 2:
    491 		out[1] = (ch | 0x80) & 0xbf;
    492 		ch = ch >> 6;
    493                 /* FALLTHROUGH */
    494 	    case 1:
    495 		out[0] = ch | first_char[len - 1];
    496                 /* FALLTHROUGH */
    497 	    }
    498 	    out += len;
    499 	}
    500     }
    501     if (out) {
    502 	if (o >= *out_len)
    503 	    return WIND_ERR_OVERRUN;
    504 	*out = '\0';
    505     }
    506     *out_len = o;
    507     return 0;
    508 }
    509 
    510 /**
    511  * Calculate the length of from converting a UCS2 string to an UTF-8 string.
    512  *
    513  * @param in an UCS2 string to convert.
    514  * @param in_len an UCS2 string length to convert.
    515  * @param out_len the length of the resulting UTF-8 string.
    516  *
    517  * @return returns 0 on success, an wind error code otherwise
    518  * @ingroup wind
    519  */
    520 
    521 int
    522 wind_ucs2utf8_length(const uint16_t *in, size_t in_len, size_t *out_len)
    523 {
    524     return wind_ucs2utf8(in, in_len, NULL, out_len);
    525 }
    526