Home | History | Annotate | Line # | Download | only in locale
c8rtomb.c revision 1.9.6.2
      1  1.9.6.2  perseant /*	$NetBSD: c8rtomb.c,v 1.9.6.2 2025/08/02 05:54:38 perseant Exp $	*/
      2  1.9.6.2  perseant 
      3  1.9.6.2  perseant /*-
      4  1.9.6.2  perseant  * Copyright (c) 2024 The NetBSD Foundation, Inc.
      5  1.9.6.2  perseant  * All rights reserved.
      6  1.9.6.2  perseant  *
      7  1.9.6.2  perseant  * Redistribution and use in source and binary forms, with or without
      8  1.9.6.2  perseant  * modification, are permitted provided that the following conditions
      9  1.9.6.2  perseant  * are met:
     10  1.9.6.2  perseant  * 1. Redistributions of source code must retain the above copyright
     11  1.9.6.2  perseant  *    notice, this list of conditions and the following disclaimer.
     12  1.9.6.2  perseant  * 2. Redistributions in binary form must reproduce the above copyright
     13  1.9.6.2  perseant  *    notice, this list of conditions and the following disclaimer in the
     14  1.9.6.2  perseant  *    documentation and/or other materials provided with the distribution.
     15  1.9.6.2  perseant  *
     16  1.9.6.2  perseant  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  1.9.6.2  perseant  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  1.9.6.2  perseant  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  1.9.6.2  perseant  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  1.9.6.2  perseant  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  1.9.6.2  perseant  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  1.9.6.2  perseant  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  1.9.6.2  perseant  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  1.9.6.2  perseant  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  1.9.6.2  perseant  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  1.9.6.2  perseant  * POSSIBILITY OF SUCH DAMAGE.
     27  1.9.6.2  perseant  */
     28  1.9.6.2  perseant 
     29  1.9.6.2  perseant /*
     30  1.9.6.2  perseant  * c8rtomb(s, c8, ps)
     31  1.9.6.2  perseant  *
     32  1.9.6.2  perseant  *	Encode the Unicode UTF-8 code unit c8 into the multibyte buffer
     33  1.9.6.2  perseant  *	s under the current locale, using multibyte encoding state ps.
     34  1.9.6.2  perseant  *
     35  1.9.6.2  perseant  *	If c8 is not the last byte of a UTF-8 scalar value sequence, no
     36  1.9.6.2  perseant  *	output will be produced, but c8 will be remembered; this must
     37  1.9.6.2  perseant  *	be followed by another call passing the following bytes.
     38  1.9.6.2  perseant  *
     39  1.9.6.2  perseant  *	Return the number of bytes stored on success, or (size_t)-1 on
     40  1.9.6.2  perseant  *	error with errno set to EILSEQ.
     41  1.9.6.2  perseant  *
     42  1.9.6.2  perseant  *	At most MB_CUR_MAX bytes will be stored.
     43  1.9.6.2  perseant  *
     44  1.9.6.2  perseant  * References:
     45  1.9.6.2  perseant  *
     46  1.9.6.2  perseant  *	The Unicode Standard, Version 15.0 -- Core Specification, The
     47  1.9.6.2  perseant  *	Unicode Consortium, Sec. 3.9 `Unicode Encoding Forms': UTF-8,
     48  1.9.6.2  perseant  *	p. 124.
     49  1.9.6.2  perseant  *	https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150
     50  1.9.6.2  perseant  *	https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150
     51  1.9.6.2  perseant  *
     52  1.9.6.2  perseant  *	F. Yergeau, `UTF-8, a transformation format of ISO 10646',
     53  1.9.6.2  perseant  *	RFC 3629, Internet Engineering Task Force, November 2003.
     54  1.9.6.2  perseant  *	https://datatracker.ietf.org/doc/html/rfc3629
     55  1.9.6.2  perseant  */
     56  1.9.6.2  perseant 
     57  1.9.6.2  perseant #include <sys/cdefs.h>
     58  1.9.6.2  perseant __RCSID("$NetBSD: c8rtomb.c,v 1.9.6.2 2025/08/02 05:54:38 perseant Exp $");
     59  1.9.6.2  perseant 
     60  1.9.6.2  perseant #include "namespace.h"
     61  1.9.6.2  perseant 
     62  1.9.6.2  perseant #include <assert.h>
     63  1.9.6.2  perseant #include <errno.h>
     64  1.9.6.2  perseant #include <limits.h>
     65  1.9.6.2  perseant #include <locale.h>
     66  1.9.6.2  perseant #include <stdalign.h>
     67  1.9.6.2  perseant #include <stddef.h>
     68  1.9.6.2  perseant #include <stdint.h>
     69  1.9.6.2  perseant #include <uchar.h>
     70  1.9.6.2  perseant 
     71  1.9.6.2  perseant #include "c32rtomb.h"
     72  1.9.6.2  perseant #include "setlocale_local.h"
     73  1.9.6.2  perseant 
     74  1.9.6.2  perseant struct c8rtombstate {
     75  1.9.6.2  perseant 	char32_t	state_c32; /* 8-bit state and 24-bit buffer */
     76  1.9.6.2  perseant 	mbstate_t	mbs;
     77  1.9.6.2  perseant };
     78  1.9.6.2  perseant __CTASSERT(offsetof(struct c8rtombstate, mbs) <= sizeof(mbstate_t));
     79  1.9.6.2  perseant __CTASSERT(sizeof(struct c32rtombstate) <= sizeof(mbstate_t) -
     80  1.9.6.2  perseant     offsetof(struct c8rtombstate, mbs));
     81  1.9.6.2  perseant __CTASSERT(alignof(struct c8rtombstate) <= alignof(mbstate_t));
     82  1.9.6.2  perseant 
     83  1.9.6.2  perseant /*
     84  1.9.6.2  perseant  * UTF-8 validation, inspired by Bjoern Hoehrmann's UTF-8 decoder at
     85  1.9.6.2  perseant  * <http://bjoern.hoehrmann.de/utf-8/decoder/dfa/>, but reimplemented
     86  1.9.6.2  perseant  * from scratch.
     87  1.9.6.2  perseant  */
     88  1.9.6.2  perseant 
/*
 * Byte classes and decoder states are both small integers that fit
 * in one octet.  A state is the offset of its row in utf8_statetab,
 * so adding a class to a state indexes the transition directly.
 */
typedef uint8_t utf8_state_t;
typedef uint8_t utf8_class_t;

/*
 * UTF8_ACCEPT: a complete scalar value has been decoded; this is also
 * the initial state.  UTF8_REJECT: the input is not valid UTF-8.
 */
#define	UTF8_ACCEPT	0
#define	UTF8_REJECT	96
     94  1.9.6.2  perseant 
/*
 * utf8_classtab[b] is the character class of byte b, used as the
 * column index into utf8_statetab:
 *
 *	 0	00..7F		single-byte (ASCII) code unit
 *	 1	A0..BF		continuation byte
 *	 2	C2..DF		lead byte of a 2-byte sequence
 *	 3	E1..EC, EE..EF	lead byte of a 3-byte sequence
 *	 4	ED		3-byte lead, next byte restricted
 *	 5	F4		4-byte lead, next byte restricted
 *	 6	F1..F3		lead byte of a 4-byte sequence
 *	 7	F0		4-byte lead, next byte restricted
 *	 8	80..8F		continuation byte
 *	 9	90..9F		continuation byte
 *	10	C0, C1, F5..FF	never valid anywhere in UTF-8
 *	11	E0		3-byte lead, next byte restricted
 *
 * For bytes accepted in the initial state, utf8_decode_step also uses
 * the class as a shift count (0xff >> class) to mask off the length
 * marker bits, which is why the lead-byte classes are numbered the
 * way they are.
 *
 * The never-valid bytes must be in class 10, whose utf8_statetab
 * column rejects in every state.  Putting them in class 8 would let
 * them pass as continuation bytes, so that e.g. C2 C0 would be
 * accepted as if it were C2 80.
 */
static const uint8_t utf8_classtab[] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   10,10,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
   11,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 7,6,6,6,5,10,10,10,10,10,10,10,10,10,10,10,
};
    105  1.9.6.2  perseant 
/*
 * utf8_statetab[state + class] is the next decoder state.  Each state
 * is the offset of a 12-entry row, one entry per byte class from
 * utf8_classtab.  State 0 is UTF8_ACCEPT (also the initial state) and
 * state 96 is UTF8_REJECT, which is absorbing.
 */
static const uint8_t utf8_statetab[] = {
    /* class:  0  1  2  3  4  5  6  7  8  9 10 11 */

    /*  0: initial / accept */
	       0,96,12,36,48,84,72,60,96,96,96,24,
    /* 12: one more byte expected (classes 1, 8, 9) */
	      96, 0,96,96,96,96,96,96, 0, 0,96,96,
    /* 24: entered on class 11; only class 1 may follow */
	      96,12,96,96,96,96,96,96,96,96,96,96,
    /* 36: two more bytes expected (classes 1, 8, 9) */
	      96,12,96,96,96,96,96,96,12,12,96,96,
    /* 48: entered on class 4; only classes 8, 9 may follow */
	      96,96,96,96,96,96,96,96,12,12,96,96,
    /* 60: entered on class 7; only classes 1, 9 may follow */
	      96,36,96,96,96,96,96,96,96,36,96,96,
    /* 72: three more bytes expected (classes 1, 8, 9) */
	      96,36,96,96,96,96,96,96,36,36,96,96,
    /* 84: entered on class 5; only class 8 may follow */
	      96,96,96,96,96,96,96,96,36,96,96,96,
    /* 96: reject, absorbing */
	      96,96,96,96,96,96,96,96,96,96,96,96,
};
    113  1.9.6.2  perseant 
    114  1.9.6.2  perseant static utf8_state_t
    115  1.9.6.2  perseant utf8_decode_step(utf8_state_t state, char8_t c8, char32_t *pc32)
    116  1.9.6.2  perseant {
    117  1.9.6.2  perseant 	const utf8_class_t class = utf8_classtab[c8];
    118  1.9.6.2  perseant 
    119  1.9.6.2  perseant 	*pc32 = (state == UTF8_ACCEPT
    120  1.9.6.2  perseant 	    ? (c8 & (0xff >> class))
    121  1.9.6.2  perseant 	    : ((c8 & 0x3f) | (*pc32 << 6)));
    122  1.9.6.2  perseant 
    123  1.9.6.2  perseant 	return utf8_statetab[state + class];
    124  1.9.6.2  perseant }
    125  1.9.6.2  perseant 
    126  1.9.6.2  perseant #ifdef __weak_alias
    127  1.9.6.2  perseant __weak_alias(c8rtomb_l,_c8rtomb_l)
    128  1.9.6.2  perseant #endif
    129  1.9.6.2  perseant 
    130  1.9.6.2  perseant size_t
    131  1.9.6.2  perseant c8rtomb(char *restrict s, char8_t c8, mbstate_t *restrict ps)
    132  1.9.6.2  perseant {
    133  1.9.6.2  perseant 
    134  1.9.6.2  perseant 	return c8rtomb_l(s, c8, ps, _current_locale());
    135  1.9.6.2  perseant }
    136  1.9.6.2  perseant 
    137  1.9.6.2  perseant size_t
    138  1.9.6.2  perseant c8rtomb_l(char *restrict s, char8_t c8, mbstate_t *restrict ps, locale_t loc)
    139  1.9.6.2  perseant {
    140  1.9.6.2  perseant 	static mbstate_t psbuf;
    141  1.9.6.2  perseant 	char buf[MB_LEN_MAX];
    142  1.9.6.2  perseant 	struct c8rtombstate *S;
    143  1.9.6.2  perseant 	utf8_state_t state;
    144  1.9.6.2  perseant 	char32_t c32;
    145  1.9.6.2  perseant 
    146  1.9.6.2  perseant 	/*
    147  1.9.6.2  perseant 	 * `If ps is a null pointer, each function uses its own
    148  1.9.6.2  perseant 	 *  internal mbstate_t object instead, which is initialized at
    149  1.9.6.2  perseant 	 *  program startup to the initial conversion state; the
    150  1.9.6.2  perseant 	 *  functions are not required to avoid data races with other
    151  1.9.6.2  perseant 	 *  calls to the same function in this case.  The
    152  1.9.6.2  perseant 	 *  implementation behaves as if no library function calls
    153  1.9.6.2  perseant 	 *  these functions with a null pointer for ps.'
    154  1.9.6.2  perseant 	 */
    155  1.9.6.2  perseant 	if (ps == NULL)
    156  1.9.6.2  perseant 		ps = &psbuf;
    157  1.9.6.2  perseant 
    158  1.9.6.2  perseant 	/*
    159  1.9.6.2  perseant 	 * `If s is a null pointer, the c8rtomb function is equivalent
    160  1.9.6.2  perseant 	 *  to the call
    161  1.9.6.2  perseant 	 *
    162  1.9.6.2  perseant 	 *	c8rtomb(buf, u8'\0', ps)
    163  1.9.6.2  perseant 	 *
    164  1.9.6.2  perseant 	 *  where buf is an internal buffer.
    165  1.9.6.2  perseant 	 */
    166  1.9.6.2  perseant 	if (s == NULL) {
    167  1.9.6.2  perseant 		s = buf;
    168  1.9.6.2  perseant 		c8 = 0;		/* XXX u8'\0' */
    169  1.9.6.2  perseant 	}
    170  1.9.6.2  perseant 
    171  1.9.6.2  perseant 	/*
    172  1.9.6.2  perseant 	 * Open the private UTF-8 decoding state.
    173  1.9.6.2  perseant 	 */
    174  1.9.6.2  perseant 	S = (struct c8rtombstate *)(void *)ps;
    175  1.9.6.2  perseant 
    176  1.9.6.2  perseant 	/*
    177  1.9.6.2  perseant 	 * `If c8 is a null character, a null byte is stored, preceded
    178  1.9.6.2  perseant 	 *  by any shift sequence needed to restore the initial shift
    179  1.9.6.2  perseant 	 *  state; the resulting state described is the initial
    180  1.9.6.2  perseant 	 *  conversion state.'
    181  1.9.6.2  perseant 	 *
    182  1.9.6.2  perseant 	 * So if c8 is null, discard any buffered input -- there's
    183  1.9.6.2  perseant 	 * nothing we can legitimately do with it -- and convert a null
    184  1.9.6.2  perseant 	 * scalar value, which by definition of c32rtomb writes out any
    185  1.9.6.2  perseant 	 * shift sequence reset followed by a null byte.
    186  1.9.6.2  perseant 	 */
    187  1.9.6.2  perseant 	if (c8 == '\0') {
    188  1.9.6.2  perseant 		c32 = 0;
    189  1.9.6.2  perseant 		goto accept;
    190  1.9.6.2  perseant 	}
    191  1.9.6.2  perseant 
    192  1.9.6.2  perseant 	/*
    193  1.9.6.2  perseant 	 * Get the current state and buffer.
    194  1.9.6.2  perseant 	 */
    195  1.9.6.2  perseant 	__CTASSERT(UTF8_ACCEPT == 0); /* initial conversion state */
    196  1.9.6.2  perseant 	state = __SHIFTOUT(S->state_c32, __BITS(31,24));
    197  1.9.6.2  perseant 	c32 = __SHIFTOUT(S->state_c32, __BITS(23,0));
    198  1.9.6.2  perseant 
    199  1.9.6.2  perseant 	/*
    200  1.9.6.2  perseant 	 * Feed the byte into the state machine to update the state.
    201  1.9.6.2  perseant 	 */
    202  1.9.6.2  perseant 	state = utf8_decode_step(state, c8, &c32);
    203  1.9.6.2  perseant 	switch (state) {
    204  1.9.6.2  perseant 	case UTF8_REJECT:
    205  1.9.6.2  perseant 		/*
    206  1.9.6.2  perseant 		 * Invalid UTF-8.  Fail with EILSEQ.
    207  1.9.6.2  perseant 		 */
    208  1.9.6.2  perseant 		errno = EILSEQ;
    209  1.9.6.2  perseant 		return (size_t)-1;
    210  1.9.6.2  perseant 	default:
    211  1.9.6.2  perseant 		/*
    212  1.9.6.2  perseant 		 * Valid UTF-8 so far but incomplete.  Update state and
    213  1.9.6.2  perseant 		 * output nothing.
    214  1.9.6.2  perseant 		 */
    215  1.9.6.2  perseant 		S->state_c32 =
    216  1.9.6.2  perseant 		    __SHIFTIN(state, __BITS(31,24)) |
    217  1.9.6.2  perseant 		    __SHIFTIN(c32, __BITS(23,0));
    218  1.9.6.2  perseant 		return 0;
    219  1.9.6.2  perseant 	case UTF8_ACCEPT:
    220  1.9.6.2  perseant 	accept:
    221  1.9.6.2  perseant 		/*
    222  1.9.6.2  perseant 		 * We have a scalar value.  Clear the state and output
    223  1.9.6.2  perseant 		 * the scalar value.
    224  1.9.6.2  perseant 		 */
    225  1.9.6.2  perseant 		S->state_c32 = 0;
    226  1.9.6.2  perseant 		return c32rtomb_l(s, c32, &S->mbs, loc);
    227  1.9.6.2  perseant 	}
    228  1.9.6.2  perseant }
    229