c8rtomb.c revision 1.2 1 1.2 riastrad /* $NetBSD: c8rtomb.c,v 1.2 2024/08/15 22:23:17 riastradh Exp $ */
2 1.1 riastrad
3 1.1 riastrad /*-
4 1.1 riastrad * Copyright (c) 2024 The NetBSD Foundation, Inc.
5 1.1 riastrad * All rights reserved.
6 1.1 riastrad *
7 1.1 riastrad * Redistribution and use in source and binary forms, with or without
8 1.1 riastrad * modification, are permitted provided that the following conditions
9 1.1 riastrad * are met:
10 1.1 riastrad * 1. Redistributions of source code must retain the above copyright
11 1.1 riastrad * notice, this list of conditions and the following disclaimer.
12 1.1 riastrad * 2. Redistributions in binary form must reproduce the above copyright
13 1.1 riastrad * notice, this list of conditions and the following disclaimer in the
14 1.1 riastrad * documentation and/or other materials provided with the distribution.
15 1.1 riastrad *
16 1.1 riastrad * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 1.1 riastrad * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 1.1 riastrad * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 1.1 riastrad * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 1.1 riastrad * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 1.1 riastrad * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 1.1 riastrad * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 1.1 riastrad * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 1.1 riastrad * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 1.1 riastrad * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 1.1 riastrad * POSSIBILITY OF SUCH DAMAGE.
27 1.1 riastrad */
28 1.1 riastrad
29 1.1 riastrad /*
30 1.1 riastrad * c8rtomb(s, c8, ps)
31 1.1 riastrad *
32 1.1 riastrad * Encode the Unicode UTF-8 code unit c8 into the multibyte buffer
33 1.1 riastrad * s under the current locale, using multibyte encoding state ps.
34 1.1 riastrad *
35 1.1 riastrad * If c8 is not the last byte of a UTF-8 scalar value sequence, no
36 1.1 riastrad * output will be produced, but c8 will be remembered; this must
37 1.1 riastrad * be followed by another call passing the following bytes.
38 1.1 riastrad *
39 1.1 riastrad * Return the number of bytes stored on success, or (size_t)-1 on
40 1.1 riastrad * error with errno set to EILSEQ.
41 1.1 riastrad *
42 1.1 riastrad * At most MB_CUR_MAX bytes will be stored.
43 1.1 riastrad *
44 1.1 riastrad * References:
45 1.1 riastrad *
46 1.1 riastrad * The Unicode Standard, Version 15.0 -- Core Specification, The
47 1.1 riastrad * Unicode Consortium, Sec. 3.9 `Unicode Encoding Forms': UTF-8,
48 1.1 riastrad * p. 124.
49 1.1 riastrad * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150
50 1.1 riastrad * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150
51 1.1 riastrad *
52 1.1 riastrad * F. Yergeau, `UTF-8, a transformation format of ISO 10646',
53 1.1 riastrad * RFC 3629, Internet Engineering Task Force, November 2003.
54 1.1 riastrad * https://datatracker.ietf.org/doc/html/rfc3629
55 1.1 riastrad */
56 1.1 riastrad
57 1.1 riastrad #include <sys/cdefs.h>
58 1.2 riastrad __RCSID("$NetBSD: c8rtomb.c,v 1.2 2024/08/15 22:23:17 riastradh Exp $");
59 1.2 riastrad
60 1.2 riastrad #include "namespace.h"
61 1.1 riastrad
62 1.1 riastrad #include <assert.h>
63 1.1 riastrad #include <errno.h>
64 1.1 riastrad #include <limits.h>
65 1.1 riastrad #include <stdalign.h>
66 1.1 riastrad #include <stddef.h>
67 1.1 riastrad #include <stdint.h>
68 1.1 riastrad #include <uchar.h>
69 1.1 riastrad
70 1.1 riastrad #include "c32rtomb.h"
71 1.1 riastrad
72 1.1 riastrad struct c8rtombstate {
73 1.1 riastrad char32_t state_c32; /* 8-bit state and 24-bit buffer */
74 1.1 riastrad mbstate_t mbs;
75 1.1 riastrad };
76 1.1 riastrad __CTASSERT(offsetof(struct c8rtombstate, mbs) <= sizeof(mbstate_t));
77 1.1 riastrad __CTASSERT(sizeof(struct c32rtombstate) <= sizeof(mbstate_t) -
78 1.1 riastrad offsetof(struct c8rtombstate, mbs));
79 1.1 riastrad __CTASSERT(alignof(struct c8rtombstate) <= alignof(mbstate_t));
80 1.1 riastrad
81 1.1 riastrad /*
82 1.1 riastrad * UTF-8 validation, inspired by Bjoern Hoehrmann's UTF-8 decoder at
83 1.1 riastrad * <http://bjoern.hoehrmann.de/utf-8/decoder/dfa/>, but reimplemented
84 1.1 riastrad * from scratch.
85 1.1 riastrad */
86 1.1 riastrad
/*
 * Distinguished decoder states.  A state is the offset of its row in
 * utf8_statetab, so every state is a multiple of 12, the number of
 * byte classes.
 */
enum {
	UTF8_ACCEPT = 0,	/* between scalar values; initial state */
	UTF8_REJECT = 96	/* input is not valid UTF-8 */
};

typedef uint_fast8_t utf8_class_t;	/* column index: 0..11 */
typedef uint_fast8_t utf8_state_t;	/* row offset: 0, 12, ..., 96 */
92 1.1 riastrad
/*
 * utf8_classtab[b]
 *
 *	Class of the byte b, used as the column index into
 *	utf8_statetab:
 *
 *	 0	00..7F		ASCII
 *	 1	A0..BF		continuation byte
 *	 2	C2..DF		lead byte, 2-byte sequence
 *	 3	E1..EC, EE..EF	lead byte, 3-byte sequence
 *	 4	ED		lead byte, 3-byte sequence, next in 80..9F
 *	 5	F4		lead byte, 4-byte sequence, next in 80..8F
 *	 6	F1..F3		lead byte, 4-byte sequence
 *	 7	F0		lead byte, 4-byte sequence, next in 90..BF
 *	 8	80..8F		continuation byte
 *	 9	90..9F		continuation byte
 *	10	C0..C1, F5..FF	never valid in UTF-8
 *	11	E0		lead byte, 3-byte sequence, next in A0..BF
 *
 *	For a lead byte the class doubles as a shift count:
 *	utf8_decode_step extracts its payload bits with
 *	(0xff >> class).
 *
 *	The never-valid bytes must have a class of their own.  Class
 *	10 is the only column of utf8_statetab that rejects in every
 *	state; class 8 is accepted wherever a continuation byte is
 *	expected, so filing C0, C1 and F5..FF under it would let
 *	malformed input such as C2 C0 through as a scalar value.
 */
static const uint8_t utf8_classtab[] = {
	/* 00..7F */
	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 80..8F */
	 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
	/* 90..9F */
	 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
	/* A0..BF */
	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* C0..DF */
	10,10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* E0..EF */
	11, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
	/* F0..FF */
	 7, 6, 6, 6, 5,10,10,10,10,10,10,10,10,10,10,10,
};
103 1.1 riastrad
/*
 * utf8_statetab[state + class]
 *
 *	Next decoder state after a byte of the given class arrives in
 *	the given state.  There is one row of 12 entries per state,
 *	and a state is the offset of its row.  The table is read-only,
 *	hence const.
 */
static const uint8_t utf8_statetab[] = {
	/*
	 * class:
	 * 0  1  2  3  4  5  6  7  8  9 10 11
	 */
	/*  0: between scalar values (UTF8_ACCEPT) */
	 0,96,12,36,48,84,72,60,96,96,96,24,
	/* 12: one continuation byte to go */
	96, 0,96,96,96,96,96,96, 0, 0,96,96,
	/* 24: after E0; next must be A0..BF */
	96,12,96,96,96,96,96,96,96,96,96,96,
	/* 36: two continuation bytes to go */
	96,12,96,96,96,96,96,96,12,12,96,96,
	/* 48: after ED; next must be 80..9F */
	96,96,96,96,96,96,96,96,12,12,96,96,
	/* 60: after F0; next must be 90..BF */
	96,36,96,96,96,96,96,96,96,36,96,96,
	/* 72: three continuation bytes to go */
	96,36,96,96,96,96,96,96,36,36,96,96,
	/* 84: after F4; next must be 80..8F */
	96,96,96,96,96,96,96,96,36,96,96,96,
	/* 96: invalid input (UTF8_REJECT); nothing leads out of it */
	96,96,96,96,96,96,96,96,96,96,96,96,
};
111 1.1 riastrad
112 1.1 riastrad static utf8_state_t
113 1.1 riastrad utf8_decode_step(utf8_state_t state, char8_t c8, char32_t *pc32)
114 1.1 riastrad {
115 1.1 riastrad const utf8_class_t class = utf8_classtab[c8];
116 1.1 riastrad
117 1.1 riastrad *pc32 = (state == UTF8_ACCEPT
118 1.1 riastrad ? (c8 & (0xff >> class))
119 1.1 riastrad : ((c8 & 0x3f) | (*pc32 << 6)));
120 1.1 riastrad
121 1.1 riastrad return utf8_statetab[state + class];
122 1.1 riastrad }
123 1.1 riastrad
124 1.1 riastrad size_t
125 1.1 riastrad c8rtomb(char *restrict s, char8_t c8, mbstate_t *restrict ps)
126 1.1 riastrad {
127 1.1 riastrad static mbstate_t psbuf;
128 1.1 riastrad char buf[MB_LEN_MAX];
129 1.1 riastrad struct c8rtombstate *S;
130 1.1 riastrad utf8_state_t state;
131 1.1 riastrad char32_t c32;
132 1.1 riastrad
133 1.1 riastrad /*
134 1.1 riastrad * `If ps is a null pointer, each function uses its own
135 1.1 riastrad * internal mbstate_t object instead, which is initialized at
136 1.1 riastrad * program startup to the initial conversion state; the
137 1.1 riastrad * functions are not required to avoid data races with other
138 1.1 riastrad * calls to the same function in this case. The
139 1.1 riastrad * implementation behaves as if no library function calls
140 1.1 riastrad * these functions with a null pointer for ps.'
141 1.1 riastrad */
142 1.1 riastrad if (ps == NULL)
143 1.1 riastrad ps = &psbuf;
144 1.1 riastrad
145 1.1 riastrad /*
146 1.1 riastrad * `If s is a null pointer, the c8rtomb function is equivalent
147 1.1 riastrad * to the call
148 1.1 riastrad *
149 1.1 riastrad * c8rtomb(buf, u8'\0', ps)
150 1.1 riastrad *
151 1.1 riastrad * where buf is an internal buffer.
152 1.1 riastrad */
153 1.1 riastrad if (s == NULL) {
154 1.1 riastrad s = buf;
155 1.1 riastrad c8 = 0; /* XXX u8'\0' */
156 1.1 riastrad }
157 1.1 riastrad
158 1.1 riastrad /*
159 1.1 riastrad * Open the private UTF-8 decoding state.
160 1.1 riastrad */
161 1.1 riastrad S = (struct c8rtombstate *)ps;
162 1.1 riastrad
163 1.1 riastrad #if 0
164 1.1 riastrad /*
165 1.1 riastrad * `If c8 is a null character, a null byte is stored, preceded
166 1.1 riastrad * by any shift sequence needed to restore the initial shift
167 1.1 riastrad * state; the resulting state described is the initial
168 1.1 riastrad * conversion state.'
169 1.1 riastrad *
170 1.1 riastrad * XXX But what else gets stored? Do we just discard any
171 1.1 riastrad * pending sequence, or do we convert it to something else, or
172 1.1 riastrad * what?
173 1.1 riastrad */
174 1.1 riastrad if (c8 == u8'\0') {
175 1.1 riastrad memset(S->buf, 0, sizeof(S->buf));
176 1.1 riastrad S->n = 0;
177 1.1 riastrad }
178 1.1 riastrad #endif
179 1.1 riastrad
180 1.1 riastrad /*
181 1.1 riastrad * Get the current state and buffer.
182 1.1 riastrad */
183 1.1 riastrad __CTASSERT(UTF8_ACCEPT == 0); /* initial conversion state */
184 1.1 riastrad state = __SHIFTOUT(S->state_c32, __BITS(31,24));
185 1.1 riastrad c32 = __SHIFTOUT(S->state_c32, __BITS(23,0));
186 1.1 riastrad
187 1.1 riastrad /*
188 1.1 riastrad * Feed the byte into the state machine to update the state.
189 1.1 riastrad */
190 1.1 riastrad state = utf8_decode_step(state, c8, &c32);
191 1.1 riastrad switch (state) {
192 1.1 riastrad case UTF8_REJECT:
193 1.1 riastrad /*
194 1.1 riastrad * Invalid UTF-8. Fail with EILSEQ.
195 1.1 riastrad */
196 1.1 riastrad errno = EILSEQ;
197 1.1 riastrad return (size_t)-1;
198 1.1 riastrad default:
199 1.1 riastrad /*
200 1.1 riastrad * Valid UTF-8 so far but incomplete. Update state and
201 1.1 riastrad * output nothing.
202 1.1 riastrad */
203 1.1 riastrad S->state_c32 = __SHIFTIN(state, __BITS(31,24)) |
204 1.1 riastrad __SHIFTIN(c32, __BITS(23,0));
205 1.1 riastrad return 0;
206 1.1 riastrad case UTF8_ACCEPT:
207 1.1 riastrad /*
208 1.1 riastrad * We have a scalar value. Clear the state and output
209 1.1 riastrad * the scalar value.
210 1.1 riastrad */
211 1.1 riastrad __CTASSERT(UTF8_ACCEPT == 0);
212 1.1 riastrad S->state_c32 = 0;
213 1.1 riastrad return c32rtomb(s, c32, &S->mbs);
214 1.1 riastrad }
215 1.1 riastrad }
216