Home | History | Annotate | Line # | Download | only in efi
      1 /* $NetBSD: utils.c,v 1.3 2025/03/02 00:03:41 riastradh Exp $ */
      2 
      3 /*
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
     14  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     15  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     16  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
     19  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     23  * SUCH DAMAGE.
     24  */
     25 
     26 #include <sys/cdefs.h>
     27 #ifndef lint
     28 __RCSID("$NetBSD: utils.c,v 1.3 2025/03/02 00:03:41 riastradh Exp $");
     29 #endif /* not lint */
     30 
     31 #include <sys/uuid.h>
     32 
     33 #include <assert.h>
     34 #include <ctype.h>
     35 #include <err.h>
     36 #include <errno.h>
     37 #include <inttypes.h>
     38 #include <limits.h>
     39 #include <stdio.h>
     40 #include <stdlib.h>
     41 #include <string.h>
     42 #include <fcntl.h>
     43 #include <unistd.h>
     44 #include <util.h>
     45 
     46 #include "defs.h"
     47 #include "utils.h"
     48 
     49 #define UCS2_REPLACEMENT_CHARACTER	0xfffd	/* U+FFFD, stored by utf8_to_ucs2() in place of input it cannot decode or represent */
     50 
     51 /************************************************************************
     52  * Character encoding conversion routines
     53  *
     54  * UEFI uses UCS-2 character encoding.  Note that this is not UTF-16
     55  * as it doesn't interpret surrogate pairs.
     56  *
     57  * From <https://www.unicode.org/faq/utf_bom.html#utf16-11>:
     58  *
     59  * "UCS-2 is obsolete terminology which refers to a Unicode
     60  * implementation up to Unicode 1.1, before surrogate code points and
     61  * UTF-16 were added to Version 2.0 of the standard. This term should
     62  * now be avoided.
     63  *
     64  * UCS-2 does not describe a data format distinct from UTF-16, because
     65  * both use exactly the same 16-bit code unit
     66  * representations. However, UCS-2 does not interpret surrogate code
     67  * points, and thus cannot be used to conformantly represent
     68  * supplementary characters.
     69  *
     70  * Sometimes in the past an implementation has been labeled "UCS-2" to
     71  * indicate that it does not support supplementary characters and
     72  * doesn't interpret pairs of surrogate code points as
     73  * characters. Such an implementation would not handle processing of
     74  * character properties, code point boundaries, collation, etc. for
     75  * supplementary characters, nor would it be able to support most
     76  * emoji, for example. [AF]"
     77  *
     78  * Regarding illegal UTF-8 sequences, the same document says:
     79  *
     80  * "None of the UTFs can generate every arbitrary byte sequence. For
     81  * example, in UTF-8 every byte of the form 110xxxxx_2 must be
     82  * followed with a byte of the form 10xxxxxx_2. A sequence such as
     83  * <110xxxxx_2 0xxxxxxx_2> is illegal, and must never be
     84  * generated. When faced with this illegal byte sequence while
     85  * transforming or interpreting, a UTF-8 conformant process must treat
     86  * the first byte 110xxxxx_2 as an illegal termination error: for
     87  * example, either signaling an error, filtering the byte out, or
     88  * representing the byte with a marker such as U+FFFD REPLACEMENT
     89  * CHARACTER. In the latter two cases, it will continue processing at
     90  * the second byte 0xxxxxxx_2.
     91  *
     92  * A conformant process must not interpret illegal or ill-formed byte
     93  * sequences as characters, however, it may take error recovery
     94  * actions. No conformant process may use irregular byte sequences to
     95  * encode out-of-band information."
     96  */
     97 
     98 /*
     99  * ibuf  = input buffer (uint16_t *)
    100  * isz   = bytes in input buffer
    101  * obuf  = output buffer (char *)
    102  * osz   = bytes in output buffer (size_t *)
    103  *
    104  * if (obuf == NULL), malloc obuf.
    105  * if (obuf != NULL), write to existing output buffer.
    106  *
    107  * Return resulting utf8 string.
    108  */
    109 PUBLIC char *
    110 ucs2_to_utf8(const uint16_t *ibuf, size_t isz, char *obuf, size_t *osz)
    111 {
    112 	uint8_t *dst;
    113 	size_t dsz, i, j, j_max, n;
    114 	uint16_t c;
    115 
    116 	assert(isz > 0);
    117 
    118 	if (obuf != NULL) {
    119 		assert(osz != NULL);
    120 		dsz = *osz;
    121 		dst = (uint8_t *)obuf;
    122 	}
    123 	else {
    124 		dsz = isz * sizeof(*dst);
    125 		dst = malloc(dsz);
    126 	}
    127 
    128 	/*
    129 	 * Each UCS2 character will encode as at most 3 UTF8 bytes.
    130 	 * 'isz' is the number of bytes in the source buffer which
    131 	 * may well be larger than the UCS2 string.  'osz' is the
    132 	 * actual number of bytes in the NUL terminated USC2 string.
    133 	 */
    134 	n = isz / sizeof(*ibuf); /* max number of characters in input buffer */
    135 	j = 0;
    136 	j_max = dsz / sizeof(*dst);
    137 	for (i = 0; i < n; i++) {
    138 		c = le16toh(ibuf[i]);
    139 		if (c == 0) {
    140 			break;
    141 		}
    142 		if (c < 0x0080) {
    143 			if (j + 1 >= j_max)
    144 				break;
    145 			dst[j++] = (uint8_t)c;
    146 		}
    147 		else if (c < 0x0800) {
    148 			if (j + 2 >= j_max)
    149 				break;
    150 			dst[j++] = 0xc0 | (uint8_t)(c >> 6);
    151 			dst[j++] = 0x80 | (uint8_t)(c & 0x3f);
    152 		}
    153 		else {
    154 			if (j + 3 >= j_max)
    155 				break;
    156 			dst[j++] = 0xe0 | (uint8_t)(c >> 12);
    157 			dst[j++] = 0x80 | (uint8_t)((c >> 6) & 0x3f);
    158 			dst[j++] = 0x80 | (uint8_t)(c & 0x3f);
    159 		}
    160 	}
    161 	if (dst != NULL)
    162 		dst[j] = '\0';
    163 
    164 	if (osz)
    165 		*osz = j;
    166 
    167 	return (char *)dst;
    168 }
    169 
    170 /*
    171  * ibuf  = input buffer (char *)
    172  * isz   = bytes in input buffer
    173  * obuf  = output buffer (uint16_t *)
    174  * osz   = bytes in output buffer (size_t *)
    175  *
    176  * if (obuf == NULL), malloc obuf.
    177  * if (obuf != NULL), write to existing output buffer.
    178  *
    179  * Return resulting ucs2 string.
    180  */
    181 PUBLIC uint16_t *
    182 utf8_to_ucs2(const char *ibuf, size_t isz, uint16_t *obuf, size_t *osz)
    183 {
    184 	const uint8_t *src = (const uint8_t *)ibuf;
    185 	uint16_t *dst;
    186 	uint16_t out;
    187 	size_t dsz, i, j, j_max;
    188 
    189 	if (obuf != NULL) {
    190 		assert(osz != NULL);
    191 		dst = obuf;
    192 		dsz = *osz;
    193 	}
    194 	else {
    195 		dsz = isz * sizeof(*dst);
    196 		dst = malloc(dsz);
    197 	}
    198 
    199 	j = 0;
    200 	j_max = dsz / sizeof(*dst);
    201 	for (i = 0; i < isz; i++) {
    202 		out = src[i];
    203 		if (out == '\0') {
    204 			break;
    205 		}
    206 		else if (j + 1 >= j_max) {
    207 			break;
    208 		}
    209 		else if ((out & 0x80) == 0) {
    210 			/* we're good to go */
    211 		}
    212 		else if ((out & 0xe0) == 0xc0) {
    213 			if (i + 1 >= isz) { /* insufficient source */
    214 				break;
    215 			}
    216 			if ((src[i + 1] & 0xc0) != 0x80) {
    217 				out = UCS2_REPLACEMENT_CHARACTER;
    218 			}
    219 			else {
    220 				out &= 0x1f;
    221 				out <<= 6;
    222 				out |= src[++i] & 0x3f;
    223 			}
    224 		}
    225 		else if ((out & 0xf0) == 0xe0) {
    226 			if (i + 2 >= isz) { /* insufficient source */
    227 				break;
    228 			}
    229 			if ((src[i + 1] & 0xc0) != 0x80 ||
    230 			    (src[i + 2] & 0xc0) != 0x80) {
    231 				out = UCS2_REPLACEMENT_CHARACTER;
    232 			}
    233 			else {
    234 				out &= 0x0f;
    235 				out <<= 6;
    236 				out |= src[++i] & 0x3f;
    237 				out <<= 6;
    238 				out |= src[++i] & 0x3f;
    239 			}
    240 		}
    241 		else {	/* cannot encode as USC2 */
    242 			out = UCS2_REPLACEMENT_CHARACTER;
    243 		}
    244 		dst[j++] = htole16(out);
    245 	}
    246 	dst[j] = '\0';
    247 
    248 	if (src[i] != '\0')
    249 		warnx("bad UTF8 string");
    250 
    251 	if (osz)
    252 		*osz = (j + 1) * sizeof(*dst);
    253 	return dst;
    254 }
    255 
    256 PUBLIC size_t
    257 utf8_to_ucs2_size(const char *src)
    258 {
    259 #if 0
    260 	uint16_t *dst;
    261 	size_t sz;
    262 
    263 	dst = utf8_to_ucs2(src, strlen(src) + 1, NULL, &sz);
    264 	free(dst);
    265 	return sz;
    266 #else
    267 	const uint8_t *buf = (const uint8_t *)src;
    268 	uint out;
    269 	size_t i, j;
    270 
    271 	j = 0;
    272 	for (i = 0; (out = buf[i]) != '\0'; i++) {
    273 		if ((out & 0x80) == 0) {
    274 			/* we're good to go */
    275 		}
    276 		else if ((out & 0xe0) == 0xc0) {
    277 			if ((buf[i + 1] & 0xc0) == 0x80) {
    278 				i++;
    279 			}
    280 		}
    281 		else if ((out & 0xf0) == 0xe0) {
    282 			if ((buf[i + 1] & 0xc0) == 0x80 &&
    283 			    (buf[i + 2] & 0xc0) == 0x80) {
    284 				i += 2;
    285 			}
    286 		}
    287 		j++;
    288 	}
    289 
    290 	return (j + 1) * sizeof(uint16_t);
    291 #endif
    292 }
    293 
    294 /************************************************************************
    295  * UUID routines
    296  */
    297 PUBLIC int
    298 uuid_scanf(struct uuid *uuid, const char *str)
    299 {
    300 
    301 	return sscanf(str,
    302 	    "%08x-%04hx-%04hx-%02hhx%02hhx-%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx",
    303 	    &uuid->time_low, &uuid->time_mid, &uuid->time_hi_and_version,
    304 	    &uuid->clock_seq_hi_and_reserved, &uuid->clock_seq_low,
    305 	    &uuid->node[0], &uuid->node[1], &uuid->node[2], &uuid->node[3],
    306 	    &uuid->node[4], &uuid->node[5]);
    307 }
    308 
    309 /*
    310  * from sys/kern/kern_uuid.c
    311  */
    312 PUBLIC int
    313 uuid_snprintf(char *buf, size_t sz, const struct uuid *uuid)
    314 {
    315 
    316 	return snprintf(buf, sz,
    317 	    "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
    318 	    uuid->time_low, uuid->time_mid, uuid->time_hi_and_version,
    319 	    uuid->clock_seq_hi_and_reserved, uuid->clock_seq_low,
    320 	    uuid->node[0], uuid->node[1], uuid->node[2], uuid->node[3],
    321 	    uuid->node[4], uuid->node[5]);
    322 }
    323 
    324 PUBLIC int
    325 uuid_printf(const struct uuid *uuid)
    326 {
    327 	char buf[UUID_STR_LEN];
    328 
    329 	(void) uuid_snprintf(buf, sizeof(buf), uuid);
    330 	printf("%s", buf);
    331 	return (0);
    332 }
    333 
    334 /************************************************************************
    335  * Misc routines
    336  */
    337 
    338 PUBLIC void
    339 show_data(const uint8_t *buf, size_t len, const char *prefix)
    340 {
    341 	uint8_t line_buf[17];
    342 	size_t i;
    343 
    344 	line_buf[16] = '\0';
    345 	for (i = 0; i < len; i++) {
    346 		if ((i & 0xf) == 0)
    347 			printf("%s%08zx: ", prefix, i);
    348 		line_buf[i & 0xf] = isprint((int)buf[i]) ? buf[i] : '.';
    349 		printf("%02x ", buf[i]);
    350 		if ((i & 0xf) == 0xf)
    351 			printf("  %s\n", line_buf);
    352 		else if ((i & 0x7) == 0x7)
    353 			printf(" ");
    354 	}
    355 	i &= 0xf;
    356 	if (i != 0) {
    357 		line_buf[i] = '\0';
    358 		if (i < 8)
    359 			printf(" ");
    360 		while (i++ < 16)
    361 			printf("   ");
    362 
    363 		printf("  %s\n", line_buf);
    364 	}
    365 }
    366 
    367 PUBLIC uint16_t
    368 strtous(const char *str, char **endptr, int base)
    369 {
    370 	uintmax_t val;
    371 	int rstatus;
    372 
    373 	val = strtou(str, endptr, base, 0, USHRT_MAX, &rstatus);
    374 
    375 	switch (rstatus) {
    376 	case EINVAL:
    377 		assert(0);
    378 		break;
    379 	case ENOTSUP:
    380 		if (endptr != NULL)
    381 			break;
    382 		/*FALLTHROUGH*/
    383 	case ECANCELED:
    384 		err(EXIT_FAILURE, "invalid numeric string: %s\n", str);
    385 	case ERANGE:
    386 		err(EXIT_FAILURE, "value out of range [0,%#x]: %s\n",
    387 		    USHRT_MAX, str);
    388 	default:
    389 		break;
    390 	}
    391 
    392 	return (uint16_t)val;
    393 }
    394 
    395 char *
    396 read_file(const char *fname, size_t *size)
    397 {
    398 	char *buf, *cp, *ep;
    399 	size_t bufsz, cnt, sz;
    400 	ssize_t ssz;
    401 	int fd, fd_flags;
    402 
    403 	assert(fname != NULL);
    404 	if (fname == NULL)
    405 		return 0;
    406 
    407 	if (strcmp(fname, "-") == 0) {
    408 		fd = STDIN_FILENO;
    409 		if ((fd_flags = fcntl(fd, F_GETFL)) == -1)
    410 			err(EXIT_FAILURE, "fcntl F_GETFL");
    411 
    412 		if (fcntl(fd, F_SETFL, O_NONBLOCK | fd_flags) == -1)
    413 			err(EXIT_FAILURE, "fcntl F_SETFL");
    414 	}
    415 	else {
    416 		fd_flags = -1;
    417 		fd = open(fname, O_RDONLY);
    418 		if (fd == -1)
    419 			err(EXIT_FAILURE, "open");
    420 	}
    421 
    422 	bufsz = 0x800;
    423 	buf = emalloc(bufsz);
    424 	cp = buf;
    425 	ep = buf + bufsz;
    426 	cnt = 0;
    427 	for (;;) {
    428 		ssz = read(fd, cp, (size_t)(ep - cp));
    429 		if (ssz == -1) {
    430 			if (errno == EAGAIN)
    431 				continue;
    432 			err(EXIT_FAILURE, "read");
    433 		}
    434 		assert(ssz >= 0);
    435 #if 0
    436 		printf("ssz: %zd\n", ssz);
    437 		show_data((uint8_t *)cp, (size_t)ssz, "");
    438 #endif
    439 		if (ssz == 0)
    440 			break;
    441 
    442 		cp += ssz;
    443 		sz = (size_t)ssz;
    444 		cnt += sz;
    445 
    446 		if (cp < ep) {
    447 			/* XXX: what about UCS-2? */
    448 			*cp = '\0';
    449 			cnt++;
    450 			break;
    451 		}
    452 
    453 		if (cp == ep) {
    454 			bufsz *= 2;
    455 			buf = erealloc(buf, bufsz);
    456 			cp = buf + cnt;
    457 			ep = buf + bufsz;
    458 		}
    459 	}
    460 	if (fd_flags != -1)
    461 		fcntl(fd, F_SETFL, fd_flags);
    462 	else
    463 		close(fd);
    464 
    465 	*size = cnt;
    466 	return buf;
    467 }
    468