1 1.6 rillig /* $NetBSD: c32rtomb.c,v 1.6 2024/08/21 18:36:11 rillig Exp $ */ 2 1.1 riastrad 3 1.1 riastrad /*- 4 1.1 riastrad * Copyright (c) 2024 The NetBSD Foundation, Inc. 5 1.1 riastrad * All rights reserved. 6 1.1 riastrad * 7 1.1 riastrad * Redistribution and use in source and binary forms, with or without 8 1.1 riastrad * modification, are permitted provided that the following conditions 9 1.1 riastrad * are met: 10 1.1 riastrad * 1. Redistributions of source code must retain the above copyright 11 1.1 riastrad * notice, this list of conditions and the following disclaimer. 12 1.1 riastrad * 2. Redistributions in binary form must reproduce the above copyright 13 1.1 riastrad * notice, this list of conditions and the following disclaimer in the 14 1.1 riastrad * documentation and/or other materials provided with the distribution. 15 1.1 riastrad * 16 1.1 riastrad * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 1.1 riastrad * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 1.1 riastrad * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 1.1 riastrad * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 1.1 riastrad * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 1.1 riastrad * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 1.1 riastrad * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 1.1 riastrad * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 1.1 riastrad * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 1.1 riastrad * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 1.1 riastrad * POSSIBILITY OF SUCH DAMAGE. 27 1.1 riastrad */ 28 1.1 riastrad 29 1.1 riastrad /* 30 1.1 riastrad * c32rtomb(s, c32, ps) 31 1.1 riastrad * 32 1.1 riastrad * Encode the Unicode UTF-32 code unit c32, which must not be a 33 1.1 riastrad * surrogate code point, into the multibyte buffer s under the 34 1.1 riastrad * current locale, using multibyte encoding state ps. A UTF-32 35 1.1 riastrad * code unit is also a Unicode scalar value, which is any Unicode 36 1.1 riastrad * code point except a surrogate. 37 1.1 riastrad * 38 1.1 riastrad * Return the number of bytes stored on success, or (size_t)-1 on 39 1.1 riastrad * error with errno set to EILSEQ. 40 1.1 riastrad * 41 1.1 riastrad * At most MB_CUR_MAX bytes will be stored. 42 1.1 riastrad * 43 1.1 riastrad * References: 44 1.1 riastrad * 45 1.1 riastrad * The Unicode Standard, Version 15.0 -- Core Specification, The 46 1.6 rillig * Unicode Consortium, Sec. 3.8 `Surrogates', p. 118. 47 1.6 rillig * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144 48 1.6 rillig * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144 49 1.1 riastrad */ 50 1.1 riastrad 51 1.1 riastrad #include <sys/cdefs.h> 52 1.6 rillig __RCSID("$NetBSD: c32rtomb.c,v 1.6 2024/08/21 18:36:11 rillig Exp $"); 53 1.2 riastrad 54 1.2 riastrad #include "namespace.h" 55 1.1 riastrad 56 1.1 riastrad #include <sys/types.h> /* broken citrus_*.h */ 57 1.1 riastrad #include <sys/queue.h> /* broken citrus_*.h */ 58 1.1 riastrad 59 1.1 riastrad #include <assert.h> 60 1.1 riastrad #include <errno.h> 61 1.1 riastrad #include <langinfo.h> 62 1.1 riastrad #include <limits.h> 63 1.3 riastrad #include <locale.h> 64 1.1 riastrad #include <paths.h> 65 1.1 riastrad #include <stddef.h> 66 1.1 riastrad #include <stdlib.h> 67 1.1 riastrad #include <uchar.h> 68 1.1 riastrad #include <wchar.h> 69 1.1 riastrad 70 1.1 riastrad #include "citrus_types.h" /* broken citrus_iconv.h */ 71 1.1 riastrad #include "citrus_module.h" /* broken citrus_iconv.h */ 72 1.1 riastrad #include "citrus_hash.h" /* broken citrus_iconv.h */ 73 1.1 riastrad #include "citrus_iconv.h" 74 1.3 riastrad #include "setlocale_local.h" 75 1.1 riastrad 76 1.2 riastrad #ifdef __weak_alias 77 1.2 riastrad __weak_alias(c32rtomb,_c32rtomb) 78 1.3 riastrad __weak_alias(c32rtomb_l,_c32rtomb_l) 79 1.2 riastrad #endif 80 1.2 riastrad 81 1.1 riastrad size_t 82 1.1 riastrad c32rtomb(char *restrict s, char32_t c32, mbstate_t *restrict ps) 83 1.1 riastrad { 84 1.3 riastrad 85 1.3 riastrad return c32rtomb_l(s, c32, ps, _current_locale()); 86 1.3 riastrad } 87 1.3 riastrad 88 1.3 riastrad size_t 89 1.3 riastrad c32rtomb_l(char *restrict s, char32_t c32, mbstate_t *restrict ps, 90 1.3 riastrad locale_t loc) 91 1.3 riastrad { 92 1.4 riastrad static mbstate_t psbuf; 93 1.1 riastrad struct _citrus_iconv *iconv = NULL; 94 1.4 riastrad char buf[2*MB_LEN_MAX]; /* [shift from init, wc] [shift to init] */ 95 1.4 riastrad char utf32le[4]; 96 1.1 riastrad const char *src; 97 1.1 riastrad char *dst; 98 1.4 riastrad size_t srcleft, dstleft, inval; 99 1.4 riastrad mbstate_t mbrtowcstate = {0}; 100 1.4 riastrad wchar_t wc; 101 1.5 riastrad size_t wc_len; 102 1.4 riastrad size_t len; 103 1.1 riastrad int error, errno_save; 104 1.1 riastrad 105 1.1 riastrad /* 106 1.1 riastrad * Save errno in case _citrus_iconv_* clobbers it. 107 1.1 riastrad */ 108 1.1 riastrad errno_save = errno; 109 1.1 riastrad 110 1.1 riastrad /* 111 1.4 riastrad * `If ps is a null pointer, each function uses its own 112 1.4 riastrad * internal mbstate_t object instead, which is initialized at 113 1.4 riastrad * program startup to the initial conversion state; the 114 1.4 riastrad * functions are not required to avoid data races with other 115 1.4 riastrad * calls to the same function in this case. The 116 1.4 riastrad * implementation behaves as if no library function calls 117 1.4 riastrad * these functions with a null pointer for ps.' 118 1.4 riastrad */ 119 1.4 riastrad if (ps == NULL) 120 1.4 riastrad ps = &psbuf; 121 1.4 riastrad 122 1.4 riastrad /* 123 1.1 riastrad * `If s is a null pointer, the c32rtomb function is equivalent 124 1.1 riastrad * to the call 125 1.1 riastrad * 126 1.1 riastrad * c32rtomb(buf, L'\0', ps) 127 1.1 riastrad * 128 1.1 riastrad * where buf is an internal buffer.' 129 1.1 riastrad */ 130 1.1 riastrad if (s == NULL) { 131 1.1 riastrad s = buf; 132 1.1 riastrad c32 = L'\0'; 133 1.1 riastrad } 134 1.1 riastrad 135 1.1 riastrad /* 136 1.4 riastrad * Reject surrogate code points. We only deal in scalar 137 1.4 riastrad * values. 138 1.4 riastrad * 139 1.4 riastrad * XXX Is this necessary? Won't iconv take care of it for us? 140 1.1 riastrad */ 141 1.1 riastrad if (c32 >= 0xd800 && c32 <= 0xdfff) { 142 1.1 riastrad errno = EILSEQ; 143 1.1 riastrad len = (size_t)-1; 144 1.1 riastrad goto out; 145 1.1 riastrad } 146 1.1 riastrad 147 1.1 riastrad /* 148 1.1 riastrad * Open an iconv handle to convert UTF-32LE to locale-dependent 149 1.1 riastrad * multibyte output. 150 1.1 riastrad */ 151 1.1 riastrad if ((error = _citrus_iconv_open(&iconv, _PATH_ICONV, "utf-32le", 152 1.3 riastrad nl_langinfo_l(CODESET, loc))) != 0) { 153 1.1 riastrad errno = EIO; /* XXX? */ 154 1.1 riastrad len = (size_t)-1; 155 1.1 riastrad goto out; 156 1.1 riastrad } 157 1.1 riastrad 158 1.1 riastrad /* 159 1.4 riastrad * Convert from UTF-32LE to a multibyte sequence. 160 1.1 riastrad */ 161 1.4 riastrad le32enc(utf32le, c32); 162 1.4 riastrad src = utf32le; 163 1.4 riastrad srcleft = sizeof(utf32le); 164 1.4 riastrad dst = buf; 165 1.1 riastrad dstleft = MB_CUR_MAX; 166 1.4 riastrad error = _citrus_iconv_convert(iconv, &src, &srcleft, &dst, &dstleft, 167 1.1 riastrad _CITRUS_ICONV_F_HIDE_INVALID, &inval); 168 1.1 riastrad if (error) { /* can't be incomplete, must be error */ 169 1.1 riastrad errno = error; 170 1.1 riastrad len = (size_t)-1; 171 1.1 riastrad goto out; 172 1.1 riastrad } 173 1.1 riastrad _DIAGASSERT(srcleft == 0); 174 1.1 riastrad _DIAGASSERT(dstleft <= MB_CUR_MAX); 175 1.1 riastrad 176 1.1 riastrad /* 177 1.1 riastrad * If we didn't produce any output, that means the scalar value 178 1.1 riastrad * c32 can't be encoded in the current locale, so treat it as 179 1.1 riastrad * EILSEQ. 180 1.1 riastrad */ 181 1.1 riastrad len = MB_CUR_MAX - dstleft; 182 1.1 riastrad if (len == 0) { 183 1.1 riastrad errno = EILSEQ; 184 1.1 riastrad len = (size_t)-1; 185 1.1 riastrad goto out; 186 1.1 riastrad } 187 1.1 riastrad 188 1.1 riastrad /* 189 1.4 riastrad * Now get a wide character out of the buffer. We don't care 190 1.4 riastrad * how much it consumes other than for a diagnostic assertion. 191 1.4 riastrad * It had better return exactly one wide character, because we 192 1.4 riastrad * are only allowed to encode one wide character's worth of 193 1.4 riastrad * multibyte output (possibly including a shift sequence). 194 1.4 riastrad * 195 1.4 riastrad * XXX What about combining characters? 196 1.4 riastrad */ 197 1.5 riastrad wc_len = mbrtowc_l(&wc, buf, len, &mbrtowcstate, loc); 198 1.5 riastrad switch (wc_len) { 199 1.5 riastrad case (size_t)-1: /* error, with errno set */ 200 1.5 riastrad len = (size_t)-1; 201 1.4 riastrad goto out; 202 1.4 riastrad case 0: /* decoded NUL */ 203 1.4 riastrad wc = 0; /* paranoia */ 204 1.5 riastrad len = wc_len; 205 1.4 riastrad break; 206 1.4 riastrad default: /* decoded wc */ 207 1.5 riastrad _DIAGASSERT(wc_len <= len); 208 1.4 riastrad } 209 1.4 riastrad 210 1.4 riastrad /* 211 1.4 riastrad * Now put the wide character out, using the caller's 212 1.4 riastrad * conversion state so that we don't output unnecessary shift 213 1.4 riastrad * sequences. 214 1.4 riastrad */ 215 1.4 riastrad len = wcrtomb_l(s, wc, ps, loc); 216 1.4 riastrad if (len == (size_t)-1) /* error, with errno set */ 217 1.4 riastrad goto out; 218 1.4 riastrad 219 1.4 riastrad /* 220 1.1 riastrad * Make sure we preserve errno on success. 221 1.1 riastrad */ 222 1.1 riastrad errno = errno_save; 223 1.1 riastrad 224 1.1 riastrad out: errno_save = errno; 225 1.1 riastrad _citrus_iconv_close(iconv); 226 1.1 riastrad errno = errno_save; 227 1.1 riastrad return len; 228 1.1 riastrad } 229