Home | History | Annotate | Line # | Download | only in locale
c8rtomb.c revision 1.2
      1  1.2  riastrad /*	$NetBSD: c8rtomb.c,v 1.2 2024/08/15 22:23:17 riastradh Exp $	*/
      2  1.1  riastrad 
      3  1.1  riastrad /*-
      4  1.1  riastrad  * Copyright (c) 2024 The NetBSD Foundation, Inc.
      5  1.1  riastrad  * All rights reserved.
      6  1.1  riastrad  *
      7  1.1  riastrad  * Redistribution and use in source and binary forms, with or without
      8  1.1  riastrad  * modification, are permitted provided that the following conditions
      9  1.1  riastrad  * are met:
     10  1.1  riastrad  * 1. Redistributions of source code must retain the above copyright
     11  1.1  riastrad  *    notice, this list of conditions and the following disclaimer.
     12  1.1  riastrad  * 2. Redistributions in binary form must reproduce the above copyright
     13  1.1  riastrad  *    notice, this list of conditions and the following disclaimer in the
     14  1.1  riastrad  *    documentation and/or other materials provided with the distribution.
     15  1.1  riastrad  *
     16  1.1  riastrad  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  1.1  riastrad  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  1.1  riastrad  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  1.1  riastrad  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  1.1  riastrad  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  1.1  riastrad  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  1.1  riastrad  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  1.1  riastrad  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  1.1  riastrad  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  1.1  riastrad  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  1.1  riastrad  * POSSIBILITY OF SUCH DAMAGE.
     27  1.1  riastrad  */
     28  1.1  riastrad 
     29  1.1  riastrad /*
     30  1.1  riastrad  * c8rtomb(s, c8, ps)
     31  1.1  riastrad  *
     32  1.1  riastrad  *	Encode the Unicode UTF-8 code unit c8 into the multibyte buffer
     33  1.1  riastrad  *	s under the current locale, using multibyte encoding state ps.
     34  1.1  riastrad  *
     35  1.1  riastrad  *	If c8 is not the last byte of a UTF-8 scalar value sequence, no
     36  1.1  riastrad  *	output will be produced, but c8 will be remembered; this must
     37  1.1  riastrad  *	be followed by another call passing the following bytes.
     38  1.1  riastrad  *
     39  1.1  riastrad  *	Return the number of bytes stored on success, or (size_t)-1 on
     40  1.1  riastrad  *	error with errno set to EILSEQ.
     41  1.1  riastrad  *
     42  1.1  riastrad  *	At most MB_CUR_MAX bytes will be stored.
     43  1.1  riastrad  *
     44  1.1  riastrad  * References:
     45  1.1  riastrad  *
     46  1.1  riastrad  *	The Unicode Standard, Version 15.0 -- Core Specification, The
     47  1.1  riastrad  *	Unicode Consortium, Sec. 3.9 `Unicode Encoding Forms': UTF-8,
     48  1.1  riastrad  *	p. 124.
     49  1.1  riastrad  *	https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150
     50  1.1  riastrad  *	https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150
     51  1.1  riastrad  *
     52  1.1  riastrad  *	F. Yergeau, `UTF-8, a transformation format of ISO 10646',
     53  1.1  riastrad  *	RFC 3629, Internet Engineering Task Force, November 2003.
     54  1.1  riastrad  *	https://datatracker.ietf.org/doc/html/rfc3629
     55  1.1  riastrad  */
     56  1.1  riastrad 
     57  1.1  riastrad #include <sys/cdefs.h>
     58  1.2  riastrad __RCSID("$NetBSD: c8rtomb.c,v 1.2 2024/08/15 22:23:17 riastradh Exp $");
     59  1.2  riastrad 
     60  1.2  riastrad #include "namespace.h"
     61  1.1  riastrad 
     62  1.1  riastrad #include <assert.h>
     63  1.1  riastrad #include <errno.h>
     64  1.1  riastrad #include <limits.h>
     65  1.1  riastrad #include <stdalign.h>
     66  1.1  riastrad #include <stddef.h>
     67  1.1  riastrad #include <stdint.h>
     68  1.1  riastrad #include <uchar.h>
     69  1.1  riastrad 
     70  1.1  riastrad #include "c32rtomb.h"
     71  1.1  riastrad 
/*
 * struct c8rtombstate
 *
 *	Private conversion state of c8rtomb.  c8rtomb does not allocate
 *	one of these; it casts the mbstate_t pointer it is working with
 *	to this type and accesses the members through that pointer.
 */
struct c8rtombstate {
	/*
	 * Packed UTF-8 decoder state: bits 31..24 hold the state
	 * machine state, bits 23..0 hold the partially decoded scalar
	 * value.  All zero means no sequence is pending.
	 */
	char32_t	state_c32; /* 8-bit state and 24-bit buffer */
	/* Conversion state passed on to c32rtomb for the output side. */
	mbstate_t	mbs;
};
/*
 * Layout checks for overlaying this structure on an mbstate_t.
 * NOTE(review): presumably c32rtomb only touches the leading
 * sizeof(struct c32rtombstate) bytes of mbs, which is what the
 * second assertion sizes against -- confirm in c32rtomb.h.
 */
/* The nested state must start inside an mbstate_t-sized object. */
__CTASSERT(offsetof(struct c8rtombstate, mbs) <= sizeof(mbstate_t));
/* What c32rtomb needs must fit in the space left after state_c32. */
__CTASSERT(sizeof(struct c32rtombstate) <= sizeof(mbstate_t) -
    offsetof(struct c8rtombstate, mbs));
/* The cast from mbstate_t * must not demand stricter alignment. */
__CTASSERT(alignof(struct c8rtombstate) <= alignof(mbstate_t));
     80  1.1  riastrad 
     81  1.1  riastrad /*
     82  1.1  riastrad  * UTF-8 validation, inspired by Bjoern Hoehrmann's UTF-8 decoder at
     83  1.1  riastrad  * <http://bjoern.hoehrmann.de/utf-8/decoder/dfa/>, but reimplemented
     84  1.1  riastrad  * from scratch.
     85  1.1  riastrad  */
     86  1.1  riastrad 
     87  1.1  riastrad #define	UTF8_ACCEPT	0
     88  1.1  riastrad #define	UTF8_REJECT	96
     89  1.1  riastrad 
     90  1.1  riastrad typedef uint_fast8_t utf8_class_t;
     91  1.1  riastrad typedef uint_fast8_t utf8_state_t;
     92  1.1  riastrad 
/*
 * utf8_classtab[b]
 *
 *	Class of the byte b.  The class is the column index into
 *	utf8_statetab, and for the first byte of a sequence it is also
 *	the shift count in `0xff >> class' that masks off the length
 *	marker bits of the leading byte.
 *
 *	class	bytes		meaning
 *	  0	00..7f		single-byte (ASCII) sequence
 *	  8	80..8f		continuation byte
 *	  9	90..9f		continuation byte
 *	  1	a0..bf		continuation byte
 *	 10	c0..c1, f5..ff	never valid anywhere in UTF-8
 *	  2	c2..df		leads a 2-byte sequence
 *	 11	e0		leads a 3-byte sequence, next in a0..bf
 *	  3	e1..ec, ee..ef	leads a 3-byte sequence
 *	  4	ed		leads a 3-byte sequence, next in 80..9f
 *	  7	f0		leads a 4-byte sequence, next in 90..bf
 *	  6	f1..f3		leads a 4-byte sequence
 *	  5	f4		leads a 4-byte sequence, next in 80..8f
 *
 *	The never-valid bytes get class 10, whose column in
 *	utf8_statetab is REJECT in every state.  They must not share
 *	class 8 with 80..8f: class 8 is accepted as a continuation
 *	byte, so e.g. the ill-formed sequence c2 ff would be decoded
 *	as if it were c2 bf.
 */
static const uint8_t utf8_classtab[] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   10,10,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
   11,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 7,6,6,6,5,10,10,10,10,10,10,10,10,10,10,10,
};
    103  1.1  riastrad 
/*
 * utf8_statetab[state + class]
 *
 *	Transition table of the UTF-8 validator.  A state is the offset
 *	of its 12-entry row, so the next state is found with a single
 *	index state + class, where class comes from utf8_classtab.
 *	Entries are themselves row offsets.
 *
 *	  0	UTF8_ACCEPT: between sequences
 *	 12	one continuation byte (80..bf) still expected
 *	 24	after e0: next byte must be a0..bf
 *	 36	two continuation bytes still expected
 *	 48	after ed: next byte must be 80..9f
 *	 60	after f0: next byte must be 90..bf
 *	 72	three continuation bytes still expected
 *	 84	after f4: next byte must be 80..8f
 *	 96	UTF8_REJECT: absorbing error state
 *
 *	Column 10 is REJECT in every row.  The table is read-only.
 */
static const uint8_t utf8_statetab[] = {
    /* class:  0  1  2  3  4  5  6  7  8  9 10 11 */
    /*  0 */   0,96,12,36,48,84,72,60,96,96,96,24,
    /* 12 */  96, 0,96,96,96,96,96,96, 0, 0,96,96,
    /* 24 */  96,12,96,96,96,96,96,96,96,96,96,96,
    /* 36 */  96,12,96,96,96,96,96,96,12,12,96,96,
    /* 48 */  96,96,96,96,96,96,96,96,12,12,96,96,
    /* 60 */  96,36,96,96,96,96,96,96,96,36,96,96,
    /* 72 */  96,36,96,96,96,96,96,96,36,36,96,96,
    /* 84 */  96,96,96,96,96,96,96,96,36,96,96,96,
    /* 96 */  96,96,96,96,96,96,96,96,96,96,96,96,
};
    111  1.1  riastrad 
    112  1.1  riastrad static utf8_state_t
    113  1.1  riastrad utf8_decode_step(utf8_state_t state, char8_t c8, char32_t *pc32)
    114  1.1  riastrad {
    115  1.1  riastrad 	const utf8_class_t class = utf8_classtab[c8];
    116  1.1  riastrad 
    117  1.1  riastrad 	*pc32 = (state == UTF8_ACCEPT
    118  1.1  riastrad 	    ? (c8 & (0xff >> class))
    119  1.1  riastrad 	    : ((c8 & 0x3f) | (*pc32 << 6)));
    120  1.1  riastrad 
    121  1.1  riastrad 	return utf8_statetab[state + class];
    122  1.1  riastrad }
    123  1.1  riastrad 
    124  1.1  riastrad size_t
    125  1.1  riastrad c8rtomb(char *restrict s, char8_t c8, mbstate_t *restrict ps)
    126  1.1  riastrad {
    127  1.1  riastrad 	static mbstate_t psbuf;
    128  1.1  riastrad 	char buf[MB_LEN_MAX];
    129  1.1  riastrad 	struct c8rtombstate *S;
    130  1.1  riastrad 	utf8_state_t state;
    131  1.1  riastrad 	char32_t c32;
    132  1.1  riastrad 
    133  1.1  riastrad 	/*
    134  1.1  riastrad 	 * `If ps is a null pointer, each function uses its own
    135  1.1  riastrad 	 *  internal mbstate_t object instead, which is initialized at
    136  1.1  riastrad 	 *  program startup to the initial conversion state; the
    137  1.1  riastrad 	 *  functions are not required to avoid data races with other
    138  1.1  riastrad 	 *  calls to the same function in this case.  The
    139  1.1  riastrad 	 *  implementation behaves as if no library function calls
    140  1.1  riastrad 	 *  these functions with a null pointer for ps.'
    141  1.1  riastrad 	 */
    142  1.1  riastrad 	if (ps == NULL)
    143  1.1  riastrad 		ps = &psbuf;
    144  1.1  riastrad 
    145  1.1  riastrad 	/*
    146  1.1  riastrad 	 * `If s is a null pointer, the c8rtomb function is equivalent
    147  1.1  riastrad 	 *  to the call
    148  1.1  riastrad 	 *
    149  1.1  riastrad 	 *	c8rtomb(buf, u8'\0', ps)
    150  1.1  riastrad 	 *
    151  1.1  riastrad 	 *  where buf is an internal buffer.
    152  1.1  riastrad 	 */
    153  1.1  riastrad 	if (s == NULL) {
    154  1.1  riastrad 		s = buf;
    155  1.1  riastrad 		c8 = 0;		/* XXX u8'\0' */
    156  1.1  riastrad 	}
    157  1.1  riastrad 
    158  1.1  riastrad 	/*
    159  1.1  riastrad 	 * Open the private UTF-8 decoding state.
    160  1.1  riastrad 	 */
    161  1.1  riastrad 	S = (struct c8rtombstate *)ps;
    162  1.1  riastrad 
    163  1.1  riastrad #if 0
    164  1.1  riastrad 	/*
    165  1.1  riastrad 	 * `If c8 is a null character, a null byte is stored, preceded
    166  1.1  riastrad 	 *  by any shift sequence needed to restore the initial shift
    167  1.1  riastrad 	 *  state; the resulting state described is the initial
    168  1.1  riastrad 	 *  conversion state.'
    169  1.1  riastrad 	 *
    170  1.1  riastrad 	 * XXX But what else gets stored?  Do we just discard any
    171  1.1  riastrad 	 * pending sequence, or do we convert it to something else, or
    172  1.1  riastrad 	 * what?
    173  1.1  riastrad 	 */
    174  1.1  riastrad 	if (c8 == u8'\0') {
    175  1.1  riastrad 		memset(S->buf, 0, sizeof(S->buf));
    176  1.1  riastrad 		S->n = 0;
    177  1.1  riastrad 	}
    178  1.1  riastrad #endif
    179  1.1  riastrad 
    180  1.1  riastrad 	/*
    181  1.1  riastrad 	 * Get the current state and buffer.
    182  1.1  riastrad 	 */
    183  1.1  riastrad 	__CTASSERT(UTF8_ACCEPT == 0); /* initial conversion state */
    184  1.1  riastrad 	state = __SHIFTOUT(S->state_c32, __BITS(31,24));
    185  1.1  riastrad 	c32 = __SHIFTOUT(S->state_c32, __BITS(23,0));
    186  1.1  riastrad 
    187  1.1  riastrad 	/*
    188  1.1  riastrad 	 * Feed the byte into the state machine to update the state.
    189  1.1  riastrad 	 */
    190  1.1  riastrad 	state = utf8_decode_step(state, c8, &c32);
    191  1.1  riastrad 	switch (state) {
    192  1.1  riastrad 	case UTF8_REJECT:
    193  1.1  riastrad 		/*
    194  1.1  riastrad 		 * Invalid UTF-8.  Fail with EILSEQ.
    195  1.1  riastrad 		 */
    196  1.1  riastrad 		errno = EILSEQ;
    197  1.1  riastrad 		return (size_t)-1;
    198  1.1  riastrad 	default:
    199  1.1  riastrad 		/*
    200  1.1  riastrad 		 * Valid UTF-8 so far but incomplete.  Update state and
    201  1.1  riastrad 		 * output nothing.
    202  1.1  riastrad 		 */
    203  1.1  riastrad 		S->state_c32 = __SHIFTIN(state, __BITS(31,24)) |
    204  1.1  riastrad 		    __SHIFTIN(c32, __BITS(23,0));
    205  1.1  riastrad 		return 0;
    206  1.1  riastrad 	case UTF8_ACCEPT:
    207  1.1  riastrad 		/*
    208  1.1  riastrad 		 * We have a scalar value.  Clear the state and output
    209  1.1  riastrad 		 * the scalar value.
    210  1.1  riastrad 		 */
    211  1.1  riastrad 		__CTASSERT(UTF8_ACCEPT == 0);
    212  1.1  riastrad 		S->state_c32 = 0;
    213  1.1  riastrad 		return c32rtomb(s, c32, &S->mbs);
    214  1.1  riastrad 	}
    215  1.1  riastrad }
    216