c8rtomb.c revision 1.9.6.2 1 1.9.6.2 perseant /* $NetBSD: c8rtomb.c,v 1.9.6.2 2025/08/02 05:54:38 perseant Exp $ */
2 1.9.6.2 perseant
3 1.9.6.2 perseant /*-
4 1.9.6.2 perseant * Copyright (c) 2024 The NetBSD Foundation, Inc.
5 1.9.6.2 perseant * All rights reserved.
6 1.9.6.2 perseant *
7 1.9.6.2 perseant * Redistribution and use in source and binary forms, with or without
8 1.9.6.2 perseant * modification, are permitted provided that the following conditions
9 1.9.6.2 perseant * are met:
10 1.9.6.2 perseant * 1. Redistributions of source code must retain the above copyright
11 1.9.6.2 perseant * notice, this list of conditions and the following disclaimer.
12 1.9.6.2 perseant * 2. Redistributions in binary form must reproduce the above copyright
13 1.9.6.2 perseant * notice, this list of conditions and the following disclaimer in the
14 1.9.6.2 perseant * documentation and/or other materials provided with the distribution.
15 1.9.6.2 perseant *
16 1.9.6.2 perseant * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 1.9.6.2 perseant * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 1.9.6.2 perseant * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 1.9.6.2 perseant * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 1.9.6.2 perseant * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 1.9.6.2 perseant * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 1.9.6.2 perseant * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 1.9.6.2 perseant * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 1.9.6.2 perseant * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 1.9.6.2 perseant * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 1.9.6.2 perseant * POSSIBILITY OF SUCH DAMAGE.
27 1.9.6.2 perseant */
28 1.9.6.2 perseant
29 1.9.6.2 perseant /*
30 1.9.6.2 perseant * c8rtomb(s, c8, ps)
31 1.9.6.2 perseant *
32 1.9.6.2 perseant * Encode the Unicode UTF-8 code unit c8 into the multibyte buffer
33 1.9.6.2 perseant * s under the current locale, using multibyte encoding state ps.
34 1.9.6.2 perseant *
35 1.9.6.2 perseant * If c8 is not the last byte of a UTF-8 scalar value sequence, no
36 1.9.6.2 perseant * output will be produced, but c8 will be remembered; this must
37 1.9.6.2 perseant * be followed by another call passing the following bytes.
38 1.9.6.2 perseant *
39 1.9.6.2 perseant * Return the number of bytes stored on success, or (size_t)-1 on
40 1.9.6.2 perseant * error with errno set to EILSEQ.
41 1.9.6.2 perseant *
42 1.9.6.2 perseant * At most MB_CUR_MAX bytes will be stored.
43 1.9.6.2 perseant *
44 1.9.6.2 perseant * References:
45 1.9.6.2 perseant *
46 1.9.6.2 perseant * The Unicode Standard, Version 15.0 -- Core Specification, The
47 1.9.6.2 perseant * Unicode Consortium, Sec. 3.9 `Unicode Encoding Forms': UTF-8,
48 1.9.6.2 perseant * p. 124.
49 1.9.6.2 perseant * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150
50 1.9.6.2 perseant * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150
51 1.9.6.2 perseant *
52 1.9.6.2 perseant * F. Yergeau, `UTF-8, a transformation format of ISO 10646',
53 1.9.6.2 perseant * RFC 3629, Internet Engineering Task Force, November 2003.
54 1.9.6.2 perseant * https://datatracker.ietf.org/doc/html/rfc3629
55 1.9.6.2 perseant */
56 1.9.6.2 perseant
57 1.9.6.2 perseant #include <sys/cdefs.h>
58 1.9.6.2 perseant __RCSID("$NetBSD: c8rtomb.c,v 1.9.6.2 2025/08/02 05:54:38 perseant Exp $");
59 1.9.6.2 perseant
60 1.9.6.2 perseant #include "namespace.h"
61 1.9.6.2 perseant
62 1.9.6.2 perseant #include <assert.h>
63 1.9.6.2 perseant #include <errno.h>
64 1.9.6.2 perseant #include <limits.h>
65 1.9.6.2 perseant #include <locale.h>
66 1.9.6.2 perseant #include <stdalign.h>
67 1.9.6.2 perseant #include <stddef.h>
68 1.9.6.2 perseant #include <stdint.h>
69 1.9.6.2 perseant #include <uchar.h>
70 1.9.6.2 perseant
71 1.9.6.2 perseant #include "c32rtomb.h"
72 1.9.6.2 perseant #include "setlocale_local.h"
73 1.9.6.2 perseant
74 1.9.6.2 perseant struct c8rtombstate {
75 1.9.6.2 perseant char32_t state_c32; /* 8-bit state and 24-bit buffer */
76 1.9.6.2 perseant mbstate_t mbs;
77 1.9.6.2 perseant };
78 1.9.6.2 perseant __CTASSERT(offsetof(struct c8rtombstate, mbs) <= sizeof(mbstate_t));
79 1.9.6.2 perseant __CTASSERT(sizeof(struct c32rtombstate) <= sizeof(mbstate_t) -
80 1.9.6.2 perseant offsetof(struct c8rtombstate, mbs));
81 1.9.6.2 perseant __CTASSERT(alignof(struct c8rtombstate) <= alignof(mbstate_t));
82 1.9.6.2 perseant
83 1.9.6.2 perseant /*
84 1.9.6.2 perseant * UTF-8 validation, inspired by Bjoern Hoehrmann's UTF-8 decoder at
85 1.9.6.2 perseant * <http://bjoern.hoehrmann.de/utf-8/decoder/dfa/>, but reimplemented
86 1.9.6.2 perseant * from scratch.
87 1.9.6.2 perseant */
88 1.9.6.2 perseant
/*
 * Distinguished decoder states.  A state value is also the offset of
 * that state's row in utf8_statetab, which is why the values are
 * multiples of 12.
 */
enum {
	UTF8_ACCEPT = 0,	/* at a scalar value boundary */
	UTF8_REJECT = 96	/* input is not valid UTF-8 */
};

typedef uint8_t utf8_class_t;	/* byte class, an entry of utf8_classtab */
typedef uint8_t utf8_state_t;	/* decoder state, an entry of utf8_statetab */
94 1.9.6.2 perseant
/*
 * utf8_classtab[b] is the class (0..11) of input byte b.  The class
 * selects the column of utf8_statetab and, for the first byte of a
 * sequence, the payload mask 0xff >> class used by utf8_decode_step.
 *
 *	 0	0x00..0x7f	single-byte (ASCII)
 *	 1	0xa0..0xbf	continuation byte
 *	 2	0xc2..0xdf	lead byte of a 2-byte sequence
 *	 3	0xe1..0xec,
 *		0xee..0xef	lead byte of a 3-byte sequence
 *	 4	0xed		3-byte lead, next byte limited to 0x80..0x9f
 *	 5	0xf4		4-byte lead, next byte limited to 0x80..0x8f
 *	 6	0xf1..0xf3	lead byte of a 4-byte sequence
 *	 7	0xf0		4-byte lead, next byte limited to 0x90..0xbf
 *	 8	0x80..0x8f	continuation byte
 *	 9	0x90..0x9f	continuation byte
 *	10	0xc0..0xc1,
 *		0xf5..0xff	never valid in UTF-8
 *	11	0xe0		3-byte lead, next byte limited to 0xa0..0xbf
 *
 * The never-valid bytes must have a class of their own (10, whose
 * column in utf8_statetab rejects from every state).  They must not
 * share class 8 with 0x80..0x8f: class 8 is accepted as a continuation
 * byte, so sequences such as C2 C0 would be decoded instead of
 * failing with EILSEQ.
 */
static const uint8_t utf8_classtab[] = {
	/* 0x00..0x7f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x80..0x9f */
	8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
	/* 0xa0..0xbf */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	/* 0xc0..0xdf */
	10,10,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	/* 0xe0..0xff */
	11,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3,
	7,6,6,6,5,10,10,10,10,10,10,10,10,10,10,10,
};
105 1.9.6.2 perseant
/*
 * State transition table, indexed by state + class.  Each state is
 * the offset of a 12-entry row, one entry per byte class 0..11; the
 * entry is the next state.  0 is UTF8_ACCEPT and 96 is UTF8_REJECT.
 */
static const uint8_t utf8_statetab[] = {
	/*  0: at a scalar value boundary */
	 0,96,12,36,48,84,72,60,96,96,96,24,
	/* 12: one continuation byte still expected */
	96, 0,96,96,96,96,96,96, 0, 0,96,96,
	/* 24: entered from state 0 on class 11; only class 1 may follow */
	96,12,96,96,96,96,96,96,96,96,96,96,
	/* 36: two continuation bytes still expected */
	96,12,96,96,96,96,96,96,12,12,96,96,
	/* 48: entered from state 0 on class 4; classes 8 and 9 may follow */
	96,96,96,96,96,96,96,96,12,12,96,96,
	/* 60: entered from state 0 on class 7; classes 1 and 9 may follow */
	96,36,96,96,96,96,96,96,96,36,96,96,
	/* 72: three continuation bytes still expected */
	96,36,96,96,96,96,96,96,36,36,96,96,
	/* 84: entered from state 0 on class 5; only class 8 may follow */
	96,96,96,96,96,96,96,96,36,96,96,96,
	/* 96: rejected; stays rejected on any input */
	96,96,96,96,96,96,96,96,96,96,96,96,
};
113 1.9.6.2 perseant
114 1.9.6.2 perseant static utf8_state_t
115 1.9.6.2 perseant utf8_decode_step(utf8_state_t state, char8_t c8, char32_t *pc32)
116 1.9.6.2 perseant {
117 1.9.6.2 perseant const utf8_class_t class = utf8_classtab[c8];
118 1.9.6.2 perseant
119 1.9.6.2 perseant *pc32 = (state == UTF8_ACCEPT
120 1.9.6.2 perseant ? (c8 & (0xff >> class))
121 1.9.6.2 perseant : ((c8 & 0x3f) | (*pc32 << 6)));
122 1.9.6.2 perseant
123 1.9.6.2 perseant return utf8_statetab[state + class];
124 1.9.6.2 perseant }
125 1.9.6.2 perseant
126 1.9.6.2 perseant #ifdef __weak_alias
127 1.9.6.2 perseant __weak_alias(c8rtomb_l,_c8rtomb_l)
128 1.9.6.2 perseant #endif
129 1.9.6.2 perseant
130 1.9.6.2 perseant size_t
131 1.9.6.2 perseant c8rtomb(char *restrict s, char8_t c8, mbstate_t *restrict ps)
132 1.9.6.2 perseant {
133 1.9.6.2 perseant
134 1.9.6.2 perseant return c8rtomb_l(s, c8, ps, _current_locale());
135 1.9.6.2 perseant }
136 1.9.6.2 perseant
137 1.9.6.2 perseant size_t
138 1.9.6.2 perseant c8rtomb_l(char *restrict s, char8_t c8, mbstate_t *restrict ps, locale_t loc)
139 1.9.6.2 perseant {
140 1.9.6.2 perseant static mbstate_t psbuf;
141 1.9.6.2 perseant char buf[MB_LEN_MAX];
142 1.9.6.2 perseant struct c8rtombstate *S;
143 1.9.6.2 perseant utf8_state_t state;
144 1.9.6.2 perseant char32_t c32;
145 1.9.6.2 perseant
146 1.9.6.2 perseant /*
147 1.9.6.2 perseant * `If ps is a null pointer, each function uses its own
148 1.9.6.2 perseant * internal mbstate_t object instead, which is initialized at
149 1.9.6.2 perseant * program startup to the initial conversion state; the
150 1.9.6.2 perseant * functions are not required to avoid data races with other
151 1.9.6.2 perseant * calls to the same function in this case. The
152 1.9.6.2 perseant * implementation behaves as if no library function calls
153 1.9.6.2 perseant * these functions with a null pointer for ps.'
154 1.9.6.2 perseant */
155 1.9.6.2 perseant if (ps == NULL)
156 1.9.6.2 perseant ps = &psbuf;
157 1.9.6.2 perseant
158 1.9.6.2 perseant /*
159 1.9.6.2 perseant * `If s is a null pointer, the c8rtomb function is equivalent
160 1.9.6.2 perseant * to the call
161 1.9.6.2 perseant *
162 1.9.6.2 perseant * c8rtomb(buf, u8'\0', ps)
163 1.9.6.2 perseant *
164 1.9.6.2 perseant * where buf is an internal buffer.
165 1.9.6.2 perseant */
166 1.9.6.2 perseant if (s == NULL) {
167 1.9.6.2 perseant s = buf;
168 1.9.6.2 perseant c8 = 0; /* XXX u8'\0' */
169 1.9.6.2 perseant }
170 1.9.6.2 perseant
171 1.9.6.2 perseant /*
172 1.9.6.2 perseant * Open the private UTF-8 decoding state.
173 1.9.6.2 perseant */
174 1.9.6.2 perseant S = (struct c8rtombstate *)(void *)ps;
175 1.9.6.2 perseant
176 1.9.6.2 perseant /*
177 1.9.6.2 perseant * `If c8 is a null character, a null byte is stored, preceded
178 1.9.6.2 perseant * by any shift sequence needed to restore the initial shift
179 1.9.6.2 perseant * state; the resulting state described is the initial
180 1.9.6.2 perseant * conversion state.'
181 1.9.6.2 perseant *
182 1.9.6.2 perseant * So if c8 is null, discard any buffered input -- there's
183 1.9.6.2 perseant * nothing we can legitimately do with it -- and convert a null
184 1.9.6.2 perseant * scalar value, which by definition of c32rtomb writes out any
185 1.9.6.2 perseant * shift sequence reset followed by a null byte.
186 1.9.6.2 perseant */
187 1.9.6.2 perseant if (c8 == '\0') {
188 1.9.6.2 perseant c32 = 0;
189 1.9.6.2 perseant goto accept;
190 1.9.6.2 perseant }
191 1.9.6.2 perseant
192 1.9.6.2 perseant /*
193 1.9.6.2 perseant * Get the current state and buffer.
194 1.9.6.2 perseant */
195 1.9.6.2 perseant __CTASSERT(UTF8_ACCEPT == 0); /* initial conversion state */
196 1.9.6.2 perseant state = __SHIFTOUT(S->state_c32, __BITS(31,24));
197 1.9.6.2 perseant c32 = __SHIFTOUT(S->state_c32, __BITS(23,0));
198 1.9.6.2 perseant
199 1.9.6.2 perseant /*
200 1.9.6.2 perseant * Feed the byte into the state machine to update the state.
201 1.9.6.2 perseant */
202 1.9.6.2 perseant state = utf8_decode_step(state, c8, &c32);
203 1.9.6.2 perseant switch (state) {
204 1.9.6.2 perseant case UTF8_REJECT:
205 1.9.6.2 perseant /*
206 1.9.6.2 perseant * Invalid UTF-8. Fail with EILSEQ.
207 1.9.6.2 perseant */
208 1.9.6.2 perseant errno = EILSEQ;
209 1.9.6.2 perseant return (size_t)-1;
210 1.9.6.2 perseant default:
211 1.9.6.2 perseant /*
212 1.9.6.2 perseant * Valid UTF-8 so far but incomplete. Update state and
213 1.9.6.2 perseant * output nothing.
214 1.9.6.2 perseant */
215 1.9.6.2 perseant S->state_c32 =
216 1.9.6.2 perseant __SHIFTIN(state, __BITS(31,24)) |
217 1.9.6.2 perseant __SHIFTIN(c32, __BITS(23,0));
218 1.9.6.2 perseant return 0;
219 1.9.6.2 perseant case UTF8_ACCEPT:
220 1.9.6.2 perseant accept:
221 1.9.6.2 perseant /*
222 1.9.6.2 perseant * We have a scalar value. Clear the state and output
223 1.9.6.2 perseant * the scalar value.
224 1.9.6.2 perseant */
225 1.9.6.2 perseant S->state_c32 = 0;
226 1.9.6.2 perseant return c32rtomb_l(s, c32, &S->mbs, loc);
227 1.9.6.2 perseant }
228 1.9.6.2 perseant }
229