/* $NetBSD: mbrtoc16.c,v 1.1 2024/08/15 14:16:33 riastradh Exp $ */
2
3 /*-
4 * Copyright (c) 2024 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 * mbrtoc16(&c16, s, n, ps)
31 *
32 * Decode a Unicode scalar value from up to n bytes out of the
33 * multibyte string s, using multibyte encoding state ps, and
34 * store the next code unit in the UTF-16 representation of that
35 * scalar value at c16.
36 *
37 * If the next scalar value in s is outside the Basic Multilingual
38 * Plane, mbrtoc16 will yield the high surrogate code point in one
39 * call that consumes input, and will yield the low surrogate code
40 * point in the next call without consuming any input and
41 * returning (size_t)-3 instead.
42 *
43 * Return the number of bytes consumed on success, or:
44 *
45 * - 0 if the code unit is NUL, or
46 * - (size_t)-3 if the trailing low surrogate of a surrogate pair
47 * was returned without consuming any additional input, or
48 * - (size_t)-2 if the input is incomplete, or
49 * - (size_t)-1 on error with errno set to EILSEQ.
50 *
51 * In the case of incomplete input, the decoding state so far
52 * after processing s[0], s[1], ..., s[n - 1] is saved in ps, so
53 * subsequent calls to mbrtoc16 will pick up n bytes later into
54 * the input stream.
55 *
56 * References:
57 *
58 * The Unicode Standard, Version 15.0 -- Core Specification, The
59 * Unicode Consortium, Sec. 3.8 `Surrogates', p. 119.
60 * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144
61 * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144
62 *
63 * The Unicode Standard, Version 15.0 -- Core Specification, The
64 * Unicode Consortium, Sec. 3.9 `Unicode Encoding Forms': UTF-16,
65 * p. 124.
66 * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150
67 * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150
68 *
69 * P. Hoffman and F. Yergeau, `UTF-16, an encoding of ISO 10646',
70 * RFC 2781, Internet Engineering Task Force, February 2000,
71 * Sec. 2.1: `Encoding UTF-16'.
72 * https://datatracker.ietf.org/doc/html/rfc2781#section-2.1
73 */
74
75 #include <sys/cdefs.h>
76 __RCSID("$NetBSD: mbrtoc16.c,v 1.1 2024/08/15 14:16:33 riastradh Exp $");
77
78 #include <assert.h>
79 #include <errno.h>
80 #include <stddef.h>
81 #include <uchar.h>
82
83 #include "mbrtoc32.h"
84
85 struct mbrtoc16state {
86 char16_t surrogate;
87 mbstate_t mbs;
88 };
89 __CTASSERT(offsetof(struct mbrtoc16state, mbs) <= sizeof(mbstate_t));
90 __CTASSERT(sizeof(struct mbrtoc32state) <= sizeof(mbstate_t) -
91 offsetof(struct mbrtoc16state, mbs));
92 __CTASSERT(_Alignof(struct mbrtoc16state) <= _Alignof(mbstate_t));
93
94 size_t
95 mbrtoc16(char16_t *restrict pc16, const char *restrict s, size_t n,
96 mbstate_t *restrict ps)
97 {
98 static mbstate_t psbuf;
99 struct mbrtoc16state *S;
100 char32_t c32;
101 size_t len;
102
103 /*
104 * `If ps is a null pointer, each function uses its own
105 * internal mbstate_t object instead, which is initialized at
106 * program startup to the initial conversion state; the
107 * functions are not required to avoid data races with other
108 * calls to the same function in this case. The
109 * implementation behaves as if no library function calls
110 * these functions with a null pointer for ps.'
111 */
112 if (ps == NULL)
113 ps = &psbuf;
114
115 /*
116 * `If s is a null pointer, the mbrtoc16 function is equivalent
117 * to the call:
118 *
119 * mbrtoc16(NULL, "", 1, ps)
120 *
121 * In this case, the values of the parameters pc16 and n are
122 * ignored.'
123 */
124 if (s == NULL) {
125 pc16 = NULL;
126 s = "";
127 n = 1;
128 }
129
130 /*
131 * Get the private conversion state.
132 */
133 S = (struct mbrtoc16state *)ps;
134
135 /*
136 * If there is a pending surrogate, stash it and consume no
137 * bytes of the input, returning (size_t)-3 to indicate that no
138 * bytes of input were consumed.
139 */
140 if (S->surrogate >= 0xdc00 && S->surrogate <= 0xdfff) {
141 if (pc16)
142 *pc16 = S->surrogate;
143 S->surrogate = 0;
144 return (size_t)-3;
145 }
146
147 /*
148 * Consume the next scalar value. If no full scalar value can
149 * be obtained, stop here.
150 */
151 len = mbrtoc32(&c32, s, n, &S->mbs);
152 switch (len) {
153 case 0: /* NUL */
154 if (pc16)
155 *pc16 = 0;
156 return 0;
157 case (size_t)-2: /* still incomplete after n bytes */
158 case (size_t)-1: /* error */
159 return len;
160 default: /* consumed len bytes of input */
161 break;
162 }
163
164 /*
165 * We consumed a scalar value from the input.
166 *
167 * If it's inside the Basic Multilingual Plane (16-bit scalar
168 * values), return it.
169 *
170 * If it's outside the Basic Multilingual Plane, split it into
171 * high and low surrogate code points, return the high, and
172 * save the low.
173 */
174 if (c32 <= 0xffff) {
175 if (pc16)
176 *pc16 = c32;
177 _DIAGASSERT(S->surrogate == 0);
178 } else {
179 c32 -= 0x10000;
180 const char16_t w1 = 0xd800 | __SHIFTOUT(c32, __BITS(19,10));
181 const char16_t w2 = 0xdc00 | __SHIFTOUT(c32, __BITS(9,0));
182 if (pc16)
183 *pc16 = w1;
184 S->surrogate = w2;
185 _DIAGASSERT(S->surrogate != 0);
186 }
187
188 /*
189 * Return the number of bytes consumed from the input.
190 */
191 return len;
192 }
193