1 /* $NetBSD: utils.c,v 1.3 2025/03/02 00:03:41 riastradh Exp $ */ 2 3 /* 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 14 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 19 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 */ 25 26 #include <sys/cdefs.h> 27 #ifndef lint 28 __RCSID("$NetBSD: utils.c,v 1.3 2025/03/02 00:03:41 riastradh Exp $"); 29 #endif /* not lint */ 30 31 #include <sys/uuid.h> 32 33 #include <assert.h> 34 #include <ctype.h> 35 #include <err.h> 36 #include <errno.h> 37 #include <inttypes.h> 38 #include <limits.h> 39 #include <stdio.h> 40 #include <stdlib.h> 41 #include <string.h> 42 #include <fcntl.h> 43 #include <unistd.h> 44 #include <util.h> 45 46 #include "defs.h" 47 #include "utils.h" 48 49 #define UCS2_REPLACEMENT_CHARACTER 0xfffd 50 51 /************************************************************************ 52 * Character encoding conversion routnes 53 * 54 * UEFI uses UCS-2 character encoding. Note that this is not UTF-16 55 * as it doesn't interpret surrogate pairs. 56 * 57 * From <https://www.unicode.org/faq/utf_bom.html#utf16-11>: 58 * 59 * "UCS-2 is obsolete terminology which refers to a Unicode 60 * implementation up to Unicode 1.1, before surrogate code points and 61 * UTF-16 were added to Version 2.0 of the standard. This term should 62 * now be avoided. 63 * 64 * UCS-2 does not describe a data format distinct from UTF-16, because 65 * both use exactly the same 16-bit code unit 66 * representations. However, UCS-2 does not interpret surrogate code 67 * points, and thus cannot be used to conformantly represent 68 * supplementary characters. 69 * 70 * Sometimes in the past an implementation has been labeled "UCS-2" to 71 * indicate that it does not support supplementary characters and 72 * doesn't interpret pairs of surrogate code points as 73 * characters. Such an implementation would not handle processing of 74 * character properties, code point boundaries, collation, etc. for 75 * supplementary characters, nor would it be able to support most 76 * emoji, for example. [AF]" 77 * 78 * Regarding illegal UTF-8 sequences, the same document says: 79 * 80 * "None of the UTFs can generate every arbitrary byte sequence. For 81 * example, in UTF-8 every byte of the form 110xxxxx_2 must be 82 * followed with a byte of the form 10xxxxxx_2. A sequence such as 83 * <110xxxxx_2 0xxxxxxx_2> is illegal, and must never be 84 * generated. When faced with this illegal byte sequence while 85 * transforming or interpreting, a UTF-8 conformant process must treat 86 * the first byte 110xxxxx_2 as an illegal termination error: for 87 * example, either signaling an error, filtering the byte out, or 88 * representing the byte with a marker such as U+FFFD REPLACEMENT 89 * CHARACTER. In the latter two cases, it will continue processing at 90 * the second byte 0xxxxxxx_2. 91 * 92 * A conformant process must not interpret illegal or ill-formed byte 93 * sequences as characters, however, it may take error recovery 94 * actions. No conformant process may use irregular byte sequences to 95 * encode out-of-band information." 96 */ 97 98 /* 99 * ibuf = input buffer (uint16_t *) 100 * isz = bytes in input buffer 101 * obuf = output buffer (char *) 102 * osz = bytes in output buffer (size_t *) 103 * 104 * if (obuf == NULL), malloc obuf. 105 * if (obuf != NULL), write to existing output buffer. 106 * 107 * Return resulting utf8 string. 108 */ 109 PUBLIC char * 110 ucs2_to_utf8(const uint16_t *ibuf, size_t isz, char *obuf, size_t *osz) 111 { 112 uint8_t *dst; 113 size_t dsz, i, j, j_max, n; 114 uint16_t c; 115 116 assert(isz > 0); 117 118 if (obuf != NULL) { 119 assert(osz != NULL); 120 dsz = *osz; 121 dst = (uint8_t *)obuf; 122 } 123 else { 124 dsz = isz * sizeof(*dst); 125 dst = malloc(dsz); 126 } 127 128 /* 129 * Each UCS2 character will encode as at most 3 UTF8 bytes. 130 * 'isz' is the number of bytes in the source buffer which 131 * may well be larger than the UCS2 string. 'osz' is the 132 * actual number of bytes in the NUL terminated USC2 string. 133 */ 134 n = isz / sizeof(*ibuf); /* max number of characters in input buffer */ 135 j = 0; 136 j_max = dsz / sizeof(*dst); 137 for (i = 0; i < n; i++) { 138 c = le16toh(ibuf[i]); 139 if (c == 0) { 140 break; 141 } 142 if (c < 0x0080) { 143 if (j + 1 >= j_max) 144 break; 145 dst[j++] = (uint8_t)c; 146 } 147 else if (c < 0x0800) { 148 if (j + 2 >= j_max) 149 break; 150 dst[j++] = 0xc0 | (uint8_t)(c >> 6); 151 dst[j++] = 0x80 | (uint8_t)(c & 0x3f); 152 } 153 else { 154 if (j + 3 >= j_max) 155 break; 156 dst[j++] = 0xe0 | (uint8_t)(c >> 12); 157 dst[j++] = 0x80 | (uint8_t)((c >> 6) & 0x3f); 158 dst[j++] = 0x80 | (uint8_t)(c & 0x3f); 159 } 160 } 161 if (dst != NULL) 162 dst[j] = '\0'; 163 164 if (osz) 165 *osz = j; 166 167 return (char *)dst; 168 } 169 170 /* 171 * ibuf = input buffer (char *) 172 * isz = bytes in input buffer 173 * obuf = output buffer (uint16_t *) 174 * osz = bytes in output buffer (size_t *) 175 * 176 * if (obuf == NULL), malloc obuf. 177 * if (obuf != NULL), write to existing output buffer. 178 * 179 * Return resulting ucs2 string. 180 */ 181 PUBLIC uint16_t * 182 utf8_to_ucs2(const char *ibuf, size_t isz, uint16_t *obuf, size_t *osz) 183 { 184 const uint8_t *src = (const uint8_t *)ibuf; 185 uint16_t *dst; 186 uint16_t out; 187 size_t dsz, i, j, j_max; 188 189 if (obuf != NULL) { 190 assert(osz != NULL); 191 dst = obuf; 192 dsz = *osz; 193 } 194 else { 195 dsz = isz * sizeof(*dst); 196 dst = malloc(dsz); 197 } 198 199 j = 0; 200 j_max = dsz / sizeof(*dst); 201 for (i = 0; i < isz; i++) { 202 out = src[i]; 203 if (out == '\0') { 204 break; 205 } 206 else if (j + 1 >= j_max) { 207 break; 208 } 209 else if ((out & 0x80) == 0) { 210 /* we're good to go */ 211 } 212 else if ((out & 0xe0) == 0xc0) { 213 if (i + 1 >= isz) { /* insufficient source */ 214 break; 215 } 216 if ((src[i + 1] & 0xc0) != 0x80) { 217 out = UCS2_REPLACEMENT_CHARACTER; 218 } 219 else { 220 out &= 0x1f; 221 out <<= 6; 222 out |= src[++i] & 0x3f; 223 } 224 } 225 else if ((out & 0xf0) == 0xe0) { 226 if (i + 2 >= isz) { /* insufficient source */ 227 break; 228 } 229 if ((src[i + 1] & 0xc0) != 0x80 || 230 (src[i + 2] & 0xc0) != 0x80) { 231 out = UCS2_REPLACEMENT_CHARACTER; 232 } 233 else { 234 out &= 0x0f; 235 out <<= 6; 236 out |= src[++i] & 0x3f; 237 out <<= 6; 238 out |= src[++i] & 0x3f; 239 } 240 } 241 else { /* cannot encode as USC2 */ 242 out = UCS2_REPLACEMENT_CHARACTER; 243 } 244 dst[j++] = htole16(out); 245 } 246 dst[j] = '\0'; 247 248 if (src[i] != '\0') 249 warnx("bad UTF8 string"); 250 251 if (osz) 252 *osz = (j + 1) * sizeof(*dst); 253 return dst; 254 } 255 256 PUBLIC size_t 257 utf8_to_ucs2_size(const char *src) 258 { 259 #if 0 260 uint16_t *dst; 261 size_t sz; 262 263 dst = utf8_to_ucs2(src, strlen(src) + 1, NULL, &sz); 264 free(dst); 265 return sz; 266 #else 267 const uint8_t *buf = (const uint8_t *)src; 268 uint out; 269 size_t i, j; 270 271 j = 0; 272 for (i = 0; (out = buf[i]) != '\0'; i++) { 273 if ((out & 0x80) == 0) { 274 /* we're good to go */ 275 } 276 else if ((out & 0xe0) == 0xc0) { 277 if ((buf[i + 1] & 0xc0) == 0x80) { 278 i++; 279 } 280 } 281 else if ((out & 0xf0) == 0xe0) { 282 if ((buf[i + 1] & 0xc0) == 0x80 && 283 (buf[i + 2] & 0xc0) == 0x80) { 284 i += 2; 285 } 286 } 287 j++; 288 } 289 290 return (j + 1) * sizeof(uint16_t); 291 #endif 292 } 293 294 /************************************************************************ 295 * UUID routines 296 */ 297 PUBLIC int 298 uuid_scanf(struct uuid *uuid, const char *str) 299 { 300 301 return sscanf(str, 302 "%08x-%04hx-%04hx-%02hhx%02hhx-%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx", 303 &uuid->time_low, &uuid->time_mid, &uuid->time_hi_and_version, 304 &uuid->clock_seq_hi_and_reserved, &uuid->clock_seq_low, 305 &uuid->node[0], &uuid->node[1], &uuid->node[2], &uuid->node[3], 306 &uuid->node[4], &uuid->node[5]); 307 } 308 309 /* 310 * from sys/kern/kern_uuid.c 311 */ 312 PUBLIC int 313 uuid_snprintf(char *buf, size_t sz, const struct uuid *uuid) 314 { 315 316 return snprintf(buf, sz, 317 "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", 318 uuid->time_low, uuid->time_mid, uuid->time_hi_and_version, 319 uuid->clock_seq_hi_and_reserved, uuid->clock_seq_low, 320 uuid->node[0], uuid->node[1], uuid->node[2], uuid->node[3], 321 uuid->node[4], uuid->node[5]); 322 } 323 324 PUBLIC int 325 uuid_printf(const struct uuid *uuid) 326 { 327 char buf[UUID_STR_LEN]; 328 329 (void) uuid_snprintf(buf, sizeof(buf), uuid); 330 printf("%s", buf); 331 return (0); 332 } 333 334 /************************************************************************ 335 * Misc routines 336 */ 337 338 PUBLIC void 339 show_data(const uint8_t *buf, size_t len, const char *prefix) 340 { 341 uint8_t line_buf[17]; 342 size_t i; 343 344 line_buf[16] = '\0'; 345 for (i = 0; i < len; i++) { 346 if ((i & 0xf) == 0) 347 printf("%s%08zx: ", prefix, i); 348 line_buf[i & 0xf] = isprint((int)buf[i]) ? buf[i] : '.'; 349 printf("%02x ", buf[i]); 350 if ((i & 0xf) == 0xf) 351 printf(" %s\n", line_buf); 352 else if ((i & 0x7) == 0x7) 353 printf(" "); 354 } 355 i &= 0xf; 356 if (i != 0) { 357 line_buf[i] = '\0'; 358 if (i < 8) 359 printf(" "); 360 while (i++ < 16) 361 printf(" "); 362 363 printf(" %s\n", line_buf); 364 } 365 } 366 367 PUBLIC uint16_t 368 strtous(const char *str, char **endptr, int base) 369 { 370 uintmax_t val; 371 int rstatus; 372 373 val = strtou(str, endptr, base, 0, USHRT_MAX, &rstatus); 374 375 switch (rstatus) { 376 case EINVAL: 377 assert(0); 378 break; 379 case ENOTSUP: 380 if (endptr != NULL) 381 break; 382 /*FALLTHROUGH*/ 383 case ECANCELED: 384 err(EXIT_FAILURE, "invalid numeric string: %s\n", str); 385 case ERANGE: 386 err(EXIT_FAILURE, "value out of range [0,%#x]: %s\n", 387 USHRT_MAX, str); 388 default: 389 break; 390 } 391 392 return (uint16_t)val; 393 } 394 395 char * 396 read_file(const char *fname, size_t *size) 397 { 398 char *buf, *cp, *ep; 399 size_t bufsz, cnt, sz; 400 ssize_t ssz; 401 int fd, fd_flags; 402 403 assert(fname != NULL); 404 if (fname == NULL) 405 return 0; 406 407 if (strcmp(fname, "-") == 0) { 408 fd = STDIN_FILENO; 409 if ((fd_flags = fcntl(fd, F_GETFL)) == -1) 410 err(EXIT_FAILURE, "fcntl F_GETFL"); 411 412 if (fcntl(fd, F_SETFL, O_NONBLOCK | fd_flags) == -1) 413 err(EXIT_FAILURE, "fcntl F_SETFL"); 414 } 415 else { 416 fd_flags = -1; 417 fd = open(fname, O_RDONLY); 418 if (fd == -1) 419 err(EXIT_FAILURE, "open"); 420 } 421 422 bufsz = 0x800; 423 buf = emalloc(bufsz); 424 cp = buf; 425 ep = buf + bufsz; 426 cnt = 0; 427 for (;;) { 428 ssz = read(fd, cp, (size_t)(ep - cp)); 429 if (ssz == -1) { 430 if (errno == EAGAIN) 431 continue; 432 err(EXIT_FAILURE, "read"); 433 } 434 assert(ssz >= 0); 435 #if 0 436 printf("ssz: %zd\n", ssz); 437 show_data((uint8_t *)cp, (size_t)ssz, ""); 438 #endif 439 if (ssz == 0) 440 break; 441 442 cp += ssz; 443 sz = (size_t)ssz; 444 cnt += sz; 445 446 if (cp < ep) { 447 /* XXX: what about UCS-2? */ 448 *cp = '\0'; 449 cnt++; 450 break; 451 } 452 453 if (cp == ep) { 454 bufsz *= 2; 455 buf = erealloc(buf, bufsz); 456 cp = buf + cnt; 457 ep = buf + bufsz; 458 } 459 } 460 if (fd_flags != -1) 461 fcntl(fd, F_SETFL, fd_flags); 462 else 463 close(fd); 464 465 *size = cnt; 466 return buf; 467 } 468