1 /* $NetBSD: utils.c,v 1.4 2026/01/06 10:54:41 nia Exp $ */ 2 3 /* 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 14 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 19 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 
24 */ 25 26 #include <sys/cdefs.h> 27 #ifndef lint 28 __RCSID("$NetBSD: utils.c,v 1.4 2026/01/06 10:54:41 nia Exp $"); 29 #endif /* not lint */ 30 31 #include <sys/endian.h> 32 #include <sys/uuid.h> 33 34 #include <assert.h> 35 #include <ctype.h> 36 #include <err.h> 37 #include <errno.h> 38 #include <inttypes.h> 39 #include <limits.h> 40 #include <stdio.h> 41 #include <stdlib.h> 42 #include <string.h> 43 #include <fcntl.h> 44 #include <unistd.h> 45 #include <util.h> 46 47 #include "defs.h" 48 #include "utils.h" 49 50 #define UCS2_REPLACEMENT_CHARACTER 0xfffd 51 52 /************************************************************************ 53 * Character encoding conversion routines 54 * 55 * UEFI uses UCS-2 character encoding. Note that this is not UTF-16 56 * as it doesn't interpret surrogate pairs. 57 * 58 * From <https://www.unicode.org/faq/utf_bom.html#utf16-11>: 59 * 60 * "UCS-2 is obsolete terminology which refers to a Unicode 61 * implementation up to Unicode 1.1, before surrogate code points and 62 * UTF-16 were added to Version 2.0 of the standard. This term should 63 * now be avoided. 64 * 65 * UCS-2 does not describe a data format distinct from UTF-16, because 66 * both use exactly the same 16-bit code unit 67 * representations. However, UCS-2 does not interpret surrogate code 68 * points, and thus cannot be used to conformantly represent 69 * supplementary characters. 70 * 71 * Sometimes in the past an implementation has been labeled "UCS-2" to 72 * indicate that it does not support supplementary characters and 73 * doesn't interpret pairs of surrogate code points as 74 * characters. Such an implementation would not handle processing of 75 * character properties, code point boundaries, collation, etc. for 76 * supplementary characters, nor would it be able to support most 77 * emoji, for example. [AF]" 78 * 79 * Regarding illegal UTF-8 sequences, the same document says: 80 * 81 * "None of the UTFs can generate every arbitrary byte sequence. 
For
 *   example, in UTF-8 every byte of the form 110xxxxx_2 must be
 *   followed with a byte of the form 10xxxxxx_2. A sequence such as
 *   <110xxxxx_2 0xxxxxxx_2> is illegal, and must never be
 *   generated. When faced with this illegal byte sequence while
 *   transforming or interpreting, a UTF-8 conformant process must treat
 *   the first byte 110xxxxx_2 as an illegal termination error: for
 *   example, either signaling an error, filtering the byte out, or
 *   representing the byte with a marker such as U+FFFD REPLACEMENT
 *   CHARACTER. In the latter two cases, it will continue processing at
 *   the second byte 0xxxxxxx_2.
 *
 *   A conformant process must not interpret illegal or ill-formed byte
 *   sequences as characters, however, it may take error recovery
 *   actions. No conformant process may use irregular byte sequences to
 *   encode out-of-band information."
 */

/*
 * Convert a UCS-2 string (little-endian 16-bit code units) to a
 * NUL-terminated UTF-8 string.
 *
 * ibuf = input buffer (uint16_t *)
 * isz  = bytes in input buffer
 * obuf = output buffer (char *)
 * osz  = bytes in output buffer (size_t *)
 *
 * if (obuf == NULL), malloc obuf (the caller must free() the result).
 * if (obuf != NULL), write to existing output buffer; osz must then be
 * non-NULL and *osz must hold the capacity of obuf in bytes.
 *
 * Conversion stops at the first zero code unit, after isz / 2 code
 * units, or as soon as the next character plus the terminating NUL
 * would not fit in the output buffer.
 *
 * On return, *osz (if osz != NULL) is the number of UTF-8 bytes
 * written, not counting the terminating NUL.
 *
 * Return resulting utf8 string: obuf if one was supplied, otherwise
 * the allocated buffer, or NULL if the allocation failed.
 */
PUBLIC char *
ucs2_to_utf8(const uint16_t *ibuf, size_t isz, char *obuf, size_t *osz)
{
	uint8_t *dst;
	size_t dsz, i, j, n;
	uint16_t c;

	assert(isz > 0);

	/*
	 * 'isz' is the number of bytes in the source buffer, which may
	 * well be larger than the UCS-2 string it holds; 'n' is thus the
	 * maximum number of characters we will look at.
	 */
	n = isz / sizeof(*ibuf);

	if (obuf != NULL) {
		assert(osz != NULL);
		dsz = *osz;
		dst = (uint8_t *)obuf;
	}
	else if (n > (SIZE_MAX - 1) / 3) {
		/* n * 3 + 1 would overflow; treat as allocation failure */
		dsz = 0;
		dst = NULL;
	}
	else {
		/*
		 * Each UCS-2 character encodes as at most 3 UTF-8 bytes;
		 * add one byte for the terminating NUL so that the
		 * result is never truncated.
		 */
		dsz = n * 3 + 1;
		dst = malloc(dsz);
	}

	/* No buffer, or not even room for the terminator: write nothing. */
	if (dst == NULL || dsz == 0) {
		if (osz != NULL)
			*osz = 0;
		return (char *)dst;
	}

	j = 0;
	for (i = 0; i < n; i++) {
		c = le16toh(ibuf[i]);
		if (c == 0) {
			break;
		}
		/*
		 * Every bounds check below keeps one byte in reserve for
		 * the terminating NUL.
		 */
		if (c < 0x0080) {
			if (j + 1 >= dsz)
				break;
			dst[j++] = (uint8_t)c;
		}
		else if (c < 0x0800) {
			if (j + 2 >= dsz)
				break;
			dst[j++] = 0xc0 | (uint8_t)(c >> 6);
			dst[j++] = 0x80 | (uint8_t)(c & 0x3f);
		}
		else {
			if (j + 3 >= dsz)
				break;
			dst[j++] = 0xe0 | (uint8_t)(c >> 12);
			dst[j++] = 0x80 | (uint8_t)((c >> 6) & 0x3f);
			dst[j++] = 0x80 | (uint8_t)(c & 0x3f);
		}
	}
	dst[j] = '\0';

	if (osz != NULL)
		*osz = j;

	return (char *)dst;
}

/*
 * ibuf = input buffer (char *)
 * isz  = bytes in input buffer
 * obuf = output buffer (uint16_t *)
 * osz  = bytes in output buffer (size_t *)
 *
 * if (obuf == NULL), malloc obuf.
 * if (obuf != NULL), write to existing output buffer.
 *
 * Return resulting ucs2 string.
*/
/*
 * Notes on utf8_to_ucs2():
 *
 *  - Conversion stops at the first NUL byte, after isz input bytes, or
 *    when the output buffer only has room left for the terminator.
 *  - Ill-formed or overlong sequences, stray continuation bytes, and
 *    lead bytes of sequences longer than three bytes are each stored as
 *    UCS2_REPLACEMENT_CHARACTER.
 *  - Output code units are stored little-endian.
 *  - On return, *osz (if osz != NULL) is the number of bytes written,
 *    including the 16-bit NUL terminator, or 0 if nothing could be
 *    written at all.
 *  - If obuf == NULL and the allocation fails, NULL is returned.
 */
PUBLIC uint16_t *
utf8_to_ucs2(const char *ibuf, size_t isz, uint16_t *obuf, size_t *osz)
{
	const uint8_t *src = (const uint8_t *)ibuf;
	uint16_t *dst;
	uint16_t out;
	size_t i, j, j_max;

	if (obuf != NULL) {
		assert(osz != NULL);
		dst = obuf;
		j_max = *osz / sizeof(*dst);
	}
	else if (isz >= SIZE_MAX / sizeof(*dst)) {
		/* size computation would overflow; treat as alloc failure */
		dst = NULL;
		j_max = 0;
	}
	else {
		/*
		 * At most one code unit per input byte, plus one for the
		 * terminator (so isz == 0 still yields a valid string).
		 */
		j_max = isz + 1;
		dst = malloc(j_max * sizeof(*dst));
	}

	/* No buffer, or not even room for the terminator: write nothing. */
	if (dst == NULL || j_max == 0) {
		if (osz != NULL)
			*osz = 0;
		return dst;
	}

	j = 0;
	for (i = 0; i < isz; i++) {
		out = src[i];
		if (out == '\0') {
			break;
		}
		else if (j + 1 >= j_max) {	/* keep room for the NUL */
			break;
		}
		else if ((out & 0x80) == 0) {
			/* we're good to go */
		}
		else if ((out & 0xe0) == 0xc0) {
			if (i + 1 >= isz) {	/* insufficient source */
				break;
			}
			if ((src[i + 1] & 0xc0) != 0x80) {
				out = UCS2_REPLACEMENT_CHARACTER;
			}
			else {
				out = (uint16_t)((out & 0x1f) << 6);
				out |= src[++i] & 0x3f;
				if (out < 0x0080)	/* overlong encoding */
					out = UCS2_REPLACEMENT_CHARACTER;
			}
		}
		else if ((out & 0xf0) == 0xe0) {
			if (i + 2 >= isz) {	/* insufficient source */
				break;
			}
			if ((src[i + 1] & 0xc0) != 0x80 ||
			    (src[i + 2] & 0xc0) != 0x80) {
				out = UCS2_REPLACEMENT_CHARACTER;
			}
			else {
				out = (uint16_t)((out & 0x0f) << 12);
				out |= (uint16_t)((src[++i] & 0x3f) << 6);
				out |= src[++i] & 0x3f;
				if (out < 0x0800)	/* overlong encoding */
					out = UCS2_REPLACEMENT_CHARACTER;
			}
		}
		else {	/* cannot encode as UCS2 */
			out = UCS2_REPLACEMENT_CHARACTER;
		}
		dst[j++] = htole16(out);
	}
	dst[j] = '\0';

	/*
	 * We stopped early on something other than the terminator.
	 * Never look at src[isz]: it is outside the input buffer.
	 */
	if (i < isz && src[i] != '\0')
		warnx("bad UTF8 string");

	if (osz != NULL)
		*osz = (j + 1) * sizeof(*dst);
	return dst;
}

/*
 * Return the number of bytes, including the 16-bit NUL terminator,
 * needed to hold the NUL-terminated UTF-8 string 'src' as UCS-2.
 *
 * A well-formed two or three byte sequence counts as one character;
 * every other byte counts as one character on its own.
 */
PUBLIC size_t
utf8_to_ucs2_size(const char *src)
{
	const uint8_t *buf = (const uint8_t *)src;
	size_t i, nchars;

	nchars = 0;
	for (i = 0; buf[i] != '\0'; i++) {
		if ((buf[i] & 0xe0) == 0xc0) {
			if ((buf[i + 1] & 0xc0) == 0x80)
				i++;
		}
		else if ((buf[i] & 0xf0) == 0xe0) {
			/*
			 * buf[i + 2] is only examined when buf[i + 1] is a
			 * continuation byte (hence not the terminator), so
			 * this never reads past the end of the string.
			 */
			if ((buf[i + 1] & 0xc0) == 0x80 &&
			    (buf[i + 2] & 0xc0) == 0x80)
				i += 2;
		}
		nchars++;
	}

	return (nchars + 1) * sizeof(uint16_t);
}

/************************************************************************
 * UUID routines
 */

/*
 * Parse the textual UUID in 'str' into 'uuid'.
 *
 * Returns the sscanf(3) result, i.e. the number of fields converted;
 * the format has 11 fields.
 */
PUBLIC int
uuid_scanf(struct uuid *uuid, const char *str)
{

	return sscanf(str,
	    "%08x-%04hx-%04hx-%02hhx%02hhx-%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx",
	    &uuid->time_low, &uuid->time_mid, &uuid->time_hi_and_version,
	    &uuid->clock_seq_hi_and_reserved, &uuid->clock_seq_low,
	    &uuid->node[0], &uuid->node[1], &uuid->node[2], &uuid->node[3],
	    &uuid->node[4], &uuid->node[5]);
}

/*
 * Format 'uuid' as text into 'buf' (of 'sz' bytes).
 * Returns the snprintf(3) result.
 *
 * from sys/kern/kern_uuid.c
 */
PUBLIC int
uuid_snprintf(char *buf, size_t sz, const struct uuid *uuid)
{

	return snprintf(buf, sz,
	    "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
	    uuid->time_low, uuid->time_mid, uuid->time_hi_and_version,
	    uuid->clock_seq_hi_and_reserved, uuid->clock_seq_low,
	    uuid->node[0], uuid->node[1], uuid->node[2], uuid->node[3],
	    uuid->node[4], uuid->node[5]);
}

/*
 * Print 'uuid' in textual form on stdout (no trailing newline).
 * Always returns 0.
 */
PUBLIC int
uuid_printf(const struct uuid *uuid)
{
	char buf[UUID_STR_LEN];

	(void) uuid_snprintf(buf, sizeof(buf), uuid);
	printf("%s", buf);
	return (0);
}

/************************************************************************
 * Misc routines
 */

/*
 * Hex dump 'len' bytes of 'buf' on stdout, 16 bytes per line.
 *
 * Each line starts with 'prefix' and the offset of its first byte,
 * followed by the bytes in hex (with an extra space after the eighth)
 * and their printable representation ('.' for non-printable bytes).
 */
PUBLIC void
show_data(const uint8_t *buf, size_t len, const char *prefix)
{
	char line_buf[17];
	size_t i;

	line_buf[16] = '\0';
	for (i = 0; i < len; i++) {
		if ((i & 0xf) == 0)
			printf("%s%08zx: ", prefix, i);
		line_buf[i & 0xf] = isprint((int)buf[i]) ? (char)buf[i] : '.';
		printf("%02x ", buf[i]);
		if ((i & 0xf) == 0xf)
			printf(" %s\n", line_buf);
		else if ((i & 0x7) == 0x7)
			printf(" ");
	}

	/* Finish a partial last line, keeping the text column aligned. */
	i &= 0xf;
	if (i != 0) {
		line_buf[i] = '\0';
		if (i < 8)		/* mid-line gap not yet printed */
			printf(" ");
		while (i++ < 16)	/* each missing byte is 3 columns */
			printf("   ");

		printf(" %s\n", line_buf);
	}
}

/*
 * Parse 'str' as an unsigned number in [0, USHRT_MAX] using strtou(3).
 *
 * 'endptr' and 'base' are passed through to strtou(3).  An ENOTSUP
 * status is tolerated only when the caller supplied 'endptr'; any
 * other failure terminates the program with an error message.
 */
PUBLIC uint16_t
strtous(const char *str, char **endptr, int base)
{
	uintmax_t val;
	int rstatus;

	val = strtou(str, endptr, base, 0, USHRT_MAX, &rstatus);

	/*
	 * The failure is described by rstatus, not by errno, so report
	 * it with errx() rather than err().
	 */
	switch (rstatus) {
	case EINVAL:
		assert(0);
		break;
	case ENOTSUP:
		if (endptr != NULL)
			break;
		/*FALLTHROUGH*/
	case ECANCELED:
		errx(EXIT_FAILURE, "invalid numeric string: %s", str);
	case ERANGE:
		errx(EXIT_FAILURE, "value out of range [0,%#x]: %s",
		    (unsigned int)USHRT_MAX, str);
	default:
		break;
	}

	return (uint16_t)val;
}

/*
 * Read the whole of file 'fname' ("-" means stdin) into a buffer
 * obtained from emalloc()/erealloc().
 *
 * The data is always followed by a NUL byte, and '*size' is set to the
 * number of bytes read plus one for that NUL.  Failure to open or read
 * the file terminates the program.
 *
 * Returns the buffer, or NULL if 'fname' is NULL.
 */
char *
read_file(const char *fname, size_t *size)
{
	char *buf;
	size_t bufsz, cnt;
	ssize_t ssz;
	int fd, use_stdin;

	assert(fname != NULL);
	if (fname == NULL)
		return NULL;

	use_stdin = strcmp(fname, "-") == 0;
	if (use_stdin) {
		/*
		 * Use plain blocking reads: forcing O_NONBLOCK only to
		 * spin on EAGAIN burns CPU and changes the flags of a
		 * descriptor we share with our parent.
		 */
		fd = STDIN_FILENO;
	}
	else {
		fd = open(fname, O_RDONLY);
		if (fd == -1)
			err(EXIT_FAILURE, "open");
	}

	bufsz = 0x800;
	buf = emalloc(bufsz);
	cnt = 0;
	for (;;) {
		/* Always keep one byte in reserve for the terminator. */
		if (cnt + 1 >= bufsz) {
			bufsz *= 2;
			buf = erealloc(buf, bufsz);
		}

		ssz = read(fd, buf + cnt, bufsz - cnt - 1);
		if (ssz == -1) {
			if (errno == EAGAIN || errno == EINTR)
				continue;
			err(EXIT_FAILURE, "read");
		}

		/*
		 * Only a zero-length read means end of file; a short
		 * read (e.g. from a pipe) does not.
		 */
		if (ssz == 0)
			break;

		cnt += (size_t)ssz;
	}

	/* XXX: what about UCS-2? */
	buf[cnt++] = '\0';

	if (!use_stdin)
		(void)close(fd);

	*size = cnt;
	return buf;
}