/*	$NetBSD: mbrtoc16.c,v 1.3 2024/08/15 20:23:26 riastradh Exp $	*/
2
3 /*-
4 * Copyright (c) 2024 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 * mbrtoc16(&c16, s, n, ps)
31 *
32 * Decode a Unicode scalar value from up to n bytes out of the
33 * multibyte string s, using multibyte encoding state ps, and
34 * store the next code unit in the UTF-16 representation of that
35 * scalar value at c16.
36 *
37 * If the next scalar value in s is outside the Basic Multilingual
38 * Plane, mbrtoc16 will yield the high surrogate code point in one
39 * call that consumes input, and will yield the low surrogate code
40 * point in the next call without consuming any input and
41 * returning (size_t)-3 instead.
42 *
43 * Return the number of bytes consumed on success, or:
44 *
45 * - 0 if the code unit is NUL, or
46 * - (size_t)-3 if the trailing low surrogate of a surrogate pair
47 * was returned without consuming any additional input, or
48 * - (size_t)-2 if the input is incomplete, or
49 * - (size_t)-1 on error with errno set to EILSEQ.
50 *
51 * In the case of incomplete input, the decoding state so far
52 * after processing s[0], s[1], ..., s[n - 1] is saved in ps, so
53 * subsequent calls to mbrtoc16 will pick up n bytes later into
54 * the input stream.
55 *
56 * References:
57 *
58 * The Unicode Standard, Version 15.0 -- Core Specification, The
59 * Unicode Consortium, Sec. 3.8 `Surrogates', p. 119.
60 * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144
61 * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144
62 *
63 * The Unicode Standard, Version 15.0 -- Core Specification, The
64 * Unicode Consortium, Sec. 3.9 `Unicode Encoding Forms': UTF-16,
65 * p. 124.
66 * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150
67 * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150
68 *
69 * P. Hoffman and F. Yergeau, `UTF-16, an encoding of ISO 10646',
70 * RFC 2781, Internet Engineering Task Force, February 2000,
71 * Sec. 2.1: `Encoding UTF-16'.
72 * https://datatracker.ietf.org/doc/html/rfc2781#section-2.1
73 */
74
75 #include <sys/cdefs.h>
76 __RCSID("$NetBSD: mbrtoc16.c,v 1.3 2024/08/15 20:23:26 riastradh Exp $");
77
78 #include <assert.h>
79 #include <errno.h>
80 #include <stdalign.h>
81 #include <stddef.h>
82 #include <uchar.h>
83
84 #include "mbrtoc32.h"
85
86 struct mbrtoc16state {
87 char16_t surrogate;
88 mbstate_t mbs;
89 };
90 __CTASSERT(offsetof(struct mbrtoc16state, mbs) <= sizeof(mbstate_t));
91 __CTASSERT(sizeof(struct mbrtoc32state) <= sizeof(mbstate_t) -
92 offsetof(struct mbrtoc16state, mbs));
93 __CTASSERT(alignof(struct mbrtoc16state) <= alignof(mbstate_t));
94
95 size_t
96 mbrtoc16(char16_t *restrict pc16, const char *restrict s, size_t n,
97 mbstate_t *restrict ps)
98 {
99 static mbstate_t psbuf;
100 struct mbrtoc16state *S;
101 char32_t c32;
102 size_t len;
103
104 /*
105 * `If ps is a null pointer, each function uses its own
106 * internal mbstate_t object instead, which is initialized at
107 * program startup to the initial conversion state; the
108 * functions are not required to avoid data races with other
109 * calls to the same function in this case. The
110 * implementation behaves as if no library function calls
111 * these functions with a null pointer for ps.'
112 */
113 if (ps == NULL)
114 ps = &psbuf;
115
116 /*
117 * `If s is a null pointer, the mbrtoc16 function is equivalent
118 * to the call:
119 *
120 * mbrtoc16(NULL, "", 1, ps)
121 *
122 * In this case, the values of the parameters pc16 and n are
123 * ignored.'
124 */
125 if (s == NULL) {
126 pc16 = NULL;
127 s = "";
128 n = 1;
129 }
130
131 /*
132 * Get the private conversion state.
133 */
134 S = (struct mbrtoc16state *)ps;
135
136 /*
137 * If there is a pending surrogate, yield it and consume no
138 * bytes of the input, returning (size_t)-3 to indicate that no
139 * bytes of input were consumed.
140 */
141 if (S->surrogate != 0) {
142 _DIAGASSERT(S->surrogate >= 0xdc00);
143 _DIAGASSERT(S->surrogate <= 0xdfff);
144 if (pc16)
145 *pc16 = S->surrogate;
146 S->surrogate = 0;
147 return (size_t)-3;
148 }
149
150 /*
151 * Consume the next scalar value. If no full scalar value can
152 * be obtained, stop here.
153 */
154 len = mbrtoc32(&c32, s, n, &S->mbs);
155 switch (len) {
156 case 0: /* NUL */
157 if (pc16)
158 *pc16 = 0;
159 return 0;
160 case (size_t)-2: /* still incomplete after n bytes */
161 case (size_t)-1: /* error */
162 return len;
163 default: /* consumed len bytes of input */
164 break;
165 }
166
167 /*
168 * We consumed a scalar value from the input.
169 *
170 * If it's inside the Basic Multilingual Plane (16-bit scalar
171 * values), return it.
172 *
173 * If it's outside the Basic Multilingual Plane, split it into
174 * high and low surrogate code points, return the high, and
175 * save the low.
176 */
177 if (c32 <= 0xffff) {
178 if (pc16)
179 *pc16 = c32;
180 _DIAGASSERT(S->surrogate == 0);
181 } else {
182 c32 -= 0x10000;
183 const char16_t w1 = 0xd800 | __SHIFTOUT(c32, __BITS(19,10));
184 const char16_t w2 = 0xdc00 | __SHIFTOUT(c32, __BITS(9,0));
185 if (pc16)
186 *pc16 = w1;
187 S->surrogate = w2;
188 _DIAGASSERT(S->surrogate != 0);
189 _DIAGASSERT(S->surrogate >= 0xdc00);
190 _DIAGASSERT(S->surrogate <= 0xdfff);
191 }
192
193 /*
194 * Return the number of bytes consumed from the input.
195 */
196 return len;
197 }
198