1 1.9 rillig /* $NetBSD: c8rtomb.c,v 1.9 2024/10/12 16:44:44 rillig Exp $ */ 2 1.1 riastrad 3 1.1 riastrad /*- 4 1.1 riastrad * Copyright (c) 2024 The NetBSD Foundation, Inc. 5 1.1 riastrad * All rights reserved. 6 1.1 riastrad * 7 1.1 riastrad * Redistribution and use in source and binary forms, with or without 8 1.1 riastrad * modification, are permitted provided that the following conditions 9 1.1 riastrad * are met: 10 1.1 riastrad * 1. Redistributions of source code must retain the above copyright 11 1.1 riastrad * notice, this list of conditions and the following disclaimer. 12 1.1 riastrad * 2. Redistributions in binary form must reproduce the above copyright 13 1.1 riastrad * notice, this list of conditions and the following disclaimer in the 14 1.1 riastrad * documentation and/or other materials provided with the distribution. 15 1.1 riastrad * 16 1.1 riastrad * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 1.1 riastrad * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 1.1 riastrad * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 1.1 riastrad * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 1.1 riastrad * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 1.1 riastrad * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 1.1 riastrad * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 1.1 riastrad * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 1.1 riastrad * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 1.1 riastrad * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 1.1 riastrad * POSSIBILITY OF SUCH DAMAGE. 27 1.1 riastrad */ 28 1.1 riastrad 29 1.1 riastrad /* 30 1.1 riastrad * c8rtomb(s, c8, ps) 31 1.1 riastrad * 32 1.1 riastrad * Encode the Unicode UTF-8 code unit c8 into the multibyte buffer 33 1.1 riastrad * s under the current locale, using multibyte encoding state ps. 34 1.1 riastrad * 35 1.1 riastrad * If c8 is not the last byte of a UTF-8 scalar value sequence, no 36 1.1 riastrad * output will be produced, but c8 will be remembered; this must 37 1.1 riastrad * be followed by another call passing the following bytes. 38 1.1 riastrad * 39 1.1 riastrad * Return the number of bytes stored on success, or (size_t)-1 on 40 1.1 riastrad * error with errno set to EILSEQ. 41 1.1 riastrad * 42 1.1 riastrad * At most MB_CUR_MAX bytes will be stored. 43 1.1 riastrad * 44 1.1 riastrad * References: 45 1.1 riastrad * 46 1.1 riastrad * The Unicode Standard, Version 15.0 -- Core Specification, The 47 1.1 riastrad * Unicode Consortium, Sec. 3.9 `Unicode Encoding Forms': UTF-8, 48 1.1 riastrad * p. 124. 49 1.1 riastrad * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150 50 1.1 riastrad * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150 51 1.1 riastrad * 52 1.1 riastrad * F. Yergeau, `UTF-8, a transformation format of ISO 10646', 53 1.1 riastrad * RFC 3629, Internet Engineering Task Force, November 2003. 54 1.1 riastrad * https://datatracker.ietf.org/doc/html/rfc3629 55 1.1 riastrad */ 56 1.1 riastrad 57 1.1 riastrad #include <sys/cdefs.h> 58 1.9 rillig __RCSID("$NetBSD: c8rtomb.c,v 1.9 2024/10/12 16:44:44 rillig Exp $"); 59 1.2 riastrad 60 1.2 riastrad #include "namespace.h" 61 1.1 riastrad 62 1.1 riastrad #include <assert.h> 63 1.1 riastrad #include <errno.h> 64 1.1 riastrad #include <limits.h> 65 1.4 riastrad #include <locale.h> 66 1.1 riastrad #include <stdalign.h> 67 1.1 riastrad #include <stddef.h> 68 1.1 riastrad #include <stdint.h> 69 1.1 riastrad #include <uchar.h> 70 1.1 riastrad 71 1.1 riastrad #include "c32rtomb.h" 72 1.4 riastrad #include "setlocale_local.h" 73 1.1 riastrad 74 1.1 riastrad struct c8rtombstate { 75 1.1 riastrad char32_t state_c32; /* 8-bit state and 24-bit buffer */ 76 1.1 riastrad mbstate_t mbs; 77 1.1 riastrad }; 78 1.1 riastrad __CTASSERT(offsetof(struct c8rtombstate, mbs) <= sizeof(mbstate_t)); 79 1.1 riastrad __CTASSERT(sizeof(struct c32rtombstate) <= sizeof(mbstate_t) - 80 1.1 riastrad offsetof(struct c8rtombstate, mbs)); 81 1.1 riastrad __CTASSERT(alignof(struct c8rtombstate) <= alignof(mbstate_t)); 82 1.1 riastrad 83 1.1 riastrad /* 84 1.1 riastrad * UTF-8 validation, inspired by Bjoern Hoermann's UTF-8 decoder at 85 1.1 riastrad * <http://bjoern.hoehrmann.de/utf-8/decoder/dfa/>, but reimplemented 86 1.1 riastrad * from scratch. 87 1.1 riastrad */ 88 1.1 riastrad 89 1.1 riastrad #define UTF8_ACCEPT 0 90 1.1 riastrad #define UTF8_REJECT 96 91 1.1 riastrad 92 1.9 rillig typedef uint8_t utf8_class_t; 93 1.9 rillig typedef uint8_t utf8_state_t; 94 1.1 riastrad 95 1.8 rillig static const uint8_t utf8_classtab[] = { 96 1.1 riastrad 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 97 1.1 riastrad 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 98 1.1 riastrad 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 99 1.1 riastrad 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 100 1.1 riastrad 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 101 1.1 riastrad 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 102 1.1 riastrad 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 103 1.1 riastrad 11,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 7,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, 104 1.1 riastrad }; 105 1.1 riastrad 106 1.8 rillig static const uint8_t utf8_statetab[] = { 107 1.1 riastrad 0,96,12,36,48,84,72,60,96,96,96,24, 96, 0,96,96,96,96,96,96, 0, 0,96,96, 108 1.1 riastrad 96,12,96,96,96,96,96,96,96,96,96,96, 96,12,96,96,96,96,96,96,12,12,96,96, 109 1.1 riastrad 96,96,96,96,96,96,96,96,12,12,96,96, 96,36,96,96,96,96,96,96,96,36,96,96, 110 1.1 riastrad 96,36,96,96,96,96,96,96,36,36,96,96, 96,96,96,96,96,96,96,96,36,96,96,96, 111 1.1 riastrad 96,96,96,96,96,96,96,96,96,96,96,96, 112 1.1 riastrad }; 113 1.1 riastrad 114 1.1 riastrad static utf8_state_t 115 1.1 riastrad utf8_decode_step(utf8_state_t state, char8_t c8, char32_t *pc32) 116 1.1 riastrad { 117 1.1 riastrad const utf8_class_t class = utf8_classtab[c8]; 118 1.1 riastrad 119 1.1 riastrad *pc32 = (state == UTF8_ACCEPT 120 1.1 riastrad ? (c8 & (0xff >> class)) 121 1.1 riastrad : ((c8 & 0x3f) | (*pc32 << 6))); 122 1.1 riastrad 123 1.1 riastrad return utf8_statetab[state + class]; 124 1.1 riastrad } 125 1.1 riastrad 126 1.4 riastrad #ifdef __weak_alias 127 1.4 riastrad __weak_alias(c8rtomb_l,_c8rtomb_l) 128 1.4 riastrad #endif 129 1.4 riastrad 130 1.1 riastrad size_t 131 1.1 riastrad c8rtomb(char *restrict s, char8_t c8, mbstate_t *restrict ps) 132 1.1 riastrad { 133 1.4 riastrad 134 1.4 riastrad return c8rtomb_l(s, c8, ps, _current_locale()); 135 1.4 riastrad } 136 1.4 riastrad 137 1.4 riastrad size_t 138 1.4 riastrad c8rtomb_l(char *restrict s, char8_t c8, mbstate_t *restrict ps, locale_t loc) 139 1.4 riastrad { 140 1.1 riastrad static mbstate_t psbuf; 141 1.1 riastrad char buf[MB_LEN_MAX]; 142 1.1 riastrad struct c8rtombstate *S; 143 1.1 riastrad utf8_state_t state; 144 1.1 riastrad char32_t c32; 145 1.1 riastrad 146 1.1 riastrad /* 147 1.1 riastrad * `If ps is a null pointer, each function uses its own 148 1.1 riastrad * internal mbstate_t object instead, which is initialized at 149 1.1 riastrad * program startup to the initial conversion state; the 150 1.1 riastrad * functions are not required to avoid data races with other 151 1.1 riastrad * calls to the same function in this case. The 152 1.1 riastrad * implementation behaves as if no library function calls 153 1.1 riastrad * these functions with a null pointer for ps.' 154 1.1 riastrad */ 155 1.1 riastrad if (ps == NULL) 156 1.1 riastrad ps = &psbuf; 157 1.1 riastrad 158 1.1 riastrad /* 159 1.1 riastrad * `If s is a null pointer, the c8rtomb function is equivalent 160 1.1 riastrad * to the call 161 1.1 riastrad * 162 1.1 riastrad * c8rtomb(buf, u8'\0', ps) 163 1.1 riastrad * 164 1.1 riastrad * where buf is an internal buffer. 165 1.1 riastrad */ 166 1.1 riastrad if (s == NULL) { 167 1.1 riastrad s = buf; 168 1.1 riastrad c8 = 0; /* XXX u8'\0' */ 169 1.1 riastrad } 170 1.1 riastrad 171 1.1 riastrad /* 172 1.1 riastrad * Open the private UTF-8 decoding state. 173 1.1 riastrad */ 174 1.3 christos S = (struct c8rtombstate *)(void *)ps; 175 1.1 riastrad 176 1.1 riastrad /* 177 1.1 riastrad * `If c8 is a null character, a null byte is stored, preceded 178 1.1 riastrad * by any shift sequence needed to restore the initial shift 179 1.1 riastrad * state; the resulting state described is the initial 180 1.1 riastrad * conversion state.' 181 1.1 riastrad * 182 1.5 riastrad * So if c8 is null, discard any buffered input -- there's 183 1.5 riastrad * nothing we can legitimately do with it -- and convert a null 184 1.5 riastrad * scalar value, which by definition of c32rtomb writes out any 185 1.5 riastrad * shift sequence reset followed by a null byte. 186 1.5 riastrad */ 187 1.5 riastrad if (c8 == '\0') { 188 1.5 riastrad c32 = 0; 189 1.5 riastrad goto accept; 190 1.1 riastrad } 191 1.1 riastrad 192 1.1 riastrad /* 193 1.1 riastrad * Get the current state and buffer. 194 1.1 riastrad */ 195 1.1 riastrad __CTASSERT(UTF8_ACCEPT == 0); /* initial conversion state */ 196 1.7 riastrad state = __SHIFTOUT(S->state_c32, __BITS(31,24)); 197 1.7 riastrad c32 = __SHIFTOUT(S->state_c32, __BITS(23,0)); 198 1.1 riastrad 199 1.1 riastrad /* 200 1.1 riastrad * Feed the byte into the state machine to update the state. 201 1.1 riastrad */ 202 1.1 riastrad state = utf8_decode_step(state, c8, &c32); 203 1.1 riastrad switch (state) { 204 1.1 riastrad case UTF8_REJECT: 205 1.1 riastrad /* 206 1.1 riastrad * Invalid UTF-8. Fail with EILSEQ. 207 1.1 riastrad */ 208 1.1 riastrad errno = EILSEQ; 209 1.1 riastrad return (size_t)-1; 210 1.1 riastrad default: 211 1.1 riastrad /* 212 1.1 riastrad * Valid UTF-8 so far but incomplete. Update state and 213 1.1 riastrad * output nothing. 214 1.1 riastrad */ 215 1.9 rillig S->state_c32 = 216 1.3 christos __SHIFTIN(state, __BITS(31,24)) | 217 1.9 rillig __SHIFTIN(c32, __BITS(23,0)); 218 1.1 riastrad return 0; 219 1.1 riastrad case UTF8_ACCEPT: 220 1.5 riastrad accept: 221 1.1 riastrad /* 222 1.1 riastrad * We have a scalar value. Clear the state and output 223 1.1 riastrad * the scalar value. 224 1.1 riastrad */ 225 1.1 riastrad S->state_c32 = 0; 226 1.4 riastrad return c32rtomb_l(s, c32, &S->mbs, loc); 227 1.1 riastrad } 228 1.1 riastrad } 229