utils.c revision 1.1 1 1.1 christos /* $NetBSD: utils.c,v 1.1 2025/02/24 13:47:57 christos Exp $ */
2 1.1 christos
3 1.1 christos /*
4 1.1 christos * Redistribution and use in source and binary forms, with or without
5 1.1 christos * modification, are permitted provided that the following conditions
6 1.1 christos * are met:
7 1.1 christos * 1. Redistributions of source code must retain the above copyright
8 1.1 christos * notice, this list of conditions and the following disclaimer.
9 1.1 christos * 2. Redistributions in binary form must reproduce the above copyright
10 1.1 christos * notice, this list of conditions and the following disclaimer in the
11 1.1 christos * documentation and/or other materials provided with the distribution.
12 1.1 christos *
13 1.1 christos * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
14 1.1 christos * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 1.1 christos * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 1.1 christos * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 1.1 christos * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 1.1 christos * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
19 1.1 christos * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 1.1 christos * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 1.1 christos * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 1.1 christos * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 1.1 christos * SUCH DAMAGE.
24 1.1 christos */
25 1.1 christos
26 1.1 christos #include <sys/cdefs.h>
27 1.1 christos #ifndef lint
28 1.1 christos __RCSID("$NetBSD: utils.c,v 1.1 2025/02/24 13:47:57 christos Exp $");
29 1.1 christos #endif /* not lint */
30 1.1 christos
31 1.1 christos #include <sys/uuid.h>
32 1.1 christos
33 1.1 christos #include <assert.h>
34 1.1 christos #include <ctype.h>
35 1.1 christos #include <err.h>
36 1.1 christos #include <errno.h>
37 1.1 christos #include <inttypes.h>
38 1.1 christos #include <limits.h>
39 1.1 christos #include <stdio.h>
40 1.1 christos #include <stdlib.h>
41 1.1 christos #include <string.h>
42 1.1 christos #include <fcntl.h>
43 1.1 christos #include <unistd.h>
44 1.1 christos #include <util.h>
45 1.1 christos
46 1.1 christos #include "defs.h"
47 1.1 christos #include "utils.h"
48 1.1 christos
49 1.1 christos #define UCS2_REPLACEMENT_CHARACTER 0xfffd
50 1.1 christos
51 1.1 christos /************************************************************************
52 1.1 christos * Character encoding conversion routines
53 1.1 christos *
54 1.1 christos * UEFI uses UCS-2 character encoding. Note that this is not UTF-16
55 1.1 christos * as it doesn't interpret surrogate pairs.
56 1.1 christos *
57 1.1 christos * From <https://www.unicode.org/faq/utf_bom.html#utf16-11>:
58 1.1 christos *
59 1.1 christos * "UCS-2 is obsolete terminology which refers to a Unicode
60 1.1 christos * implementation up to Unicode 1.1, before surrogate code points and
61 1.1 christos * UTF-16 were added to Version 2.0 of the standard. This term should
62 1.1 christos * now be avoided.
63 1.1 christos *
64 1.1 christos * UCS-2 does not describe a data format distinct from UTF-16, because
65 1.1 christos * both use exactly the same 16-bit code unit
66 1.1 christos * representations. However, UCS-2 does not interpret surrogate code
67 1.1 christos * points, and thus cannot be used to conformantly represent
68 1.1 christos * supplementary characters.
69 1.1 christos *
70 1.1 christos * Sometimes in the past an implementation has been labeled "UCS-2" to
71 1.1 christos * indicate that it does not support supplementary characters and
72 1.1 christos * doesn't interpret pairs of surrogate code points as
73 1.1 christos * characters. Such an implementation would not handle processing of
74 1.1 christos * character properties, code point boundaries, collation, etc. for
75 1.1 christos * supplementary characters, nor would it be able to support most
76 1.1 christos * emoji, for example. [AF]"
77 1.1 christos *
78 1.1 christos * Regarding illegal UTF-8 sequences, the same document says:
79 1.1 christos *
80 1.1 christos * "None of the UTFs can generate every arbitrary byte sequence. For
81 1.1 christos * example, in UTF-8 every byte of the form 110xxxxx_2 must be
82 1.1 christos * followed with a byte of the form 10xxxxxx_2. A sequence such as
83 1.1 christos * <110xxxxx_2 0xxxxxxx_2> is illegal, and must never be
84 1.1 christos * generated. When faced with this illegal byte sequence while
85 1.1 christos * transforming or interpreting, a UTF-8 conformant process must treat
86 1.1 christos * the first byte 110xxxxx_2 as an illegal termination error: for
87 1.1 christos * example, either signaling an error, filtering the byte out, or
88 1.1 christos * representing the byte with a marker such as U+FFFD REPLACEMENT
89 1.1 christos * CHARACTER. In the latter two cases, it will continue processing at
90 1.1 christos * the second byte 0xxxxxxx_2.
91 1.1 christos *
92 1.1 christos * A conformant process must not interpret illegal or ill-formed byte
93 1.1 christos * sequences as characters, however, it may take error recovery
94 1.1 christos * actions. No conformant process may use irregular byte sequences to
95 1.1 christos * encode out-of-band information."
96 1.1 christos */
97 1.1 christos
98 1.1 christos /*
99 1.1 christos * ibuf = input buffer (uint16_t *)
100 1.1 christos * isz = bytes in input buffer
101 1.1 christos * obuf = output buffer (char *)
102 1.1 christos * osz = bytes in output buffer (size_t *)
103 1.1 christos *
104 1.1 christos * if (obuf == NULL), malloc obuf.
105 1.1 christos * if (obuf != NULL), write to existing output buffer.
106 1.1 christos *
107 1.1 christos * Return resulting utf8 string.
108 1.1 christos */
109 1.1 christos PUBLIC char *
110 1.1 christos ucs2_to_utf8(const uint16_t *ibuf, size_t isz, char *obuf, size_t *osz)
111 1.1 christos {
112 1.1 christos uint8_t *dst;
113 1.1 christos size_t dsz, i, j, j_max, n;
114 1.1 christos uint16_t c;
115 1.1 christos
116 1.1 christos assert(isz > 0);
117 1.1 christos
118 1.1 christos if (obuf != NULL) {
119 1.1 christos assert(osz != NULL);
120 1.1 christos dsz = *osz;
121 1.1 christos dst = (uint8_t *)obuf;
122 1.1 christos }
123 1.1 christos else {
124 1.1 christos dsz = isz * sizeof(*dst);
125 1.1 christos dst = malloc(dsz);
126 1.1 christos }
127 1.1 christos
128 1.1 christos /*
129 1.1 christos * Each UCS2 character will encode as at most 3 UTF8 bytes.
130 1.1 christos * 'isz' is the number of bytes in the source buffer which
131 1.1 christos * may well be larger than the UCS2 string. 'osz' is the
132 1.1 christos * actual number of bytes in the NUL terminated USC2 string.
133 1.1 christos */
134 1.1 christos n = isz / sizeof(*ibuf); /* max number of characters in input buffer */
135 1.1 christos j = 0;
136 1.1 christos j_max = dsz / sizeof(*dst);
137 1.1 christos for (i = 0; i < n; i++) {
138 1.1 christos c = le16toh(ibuf[i]);
139 1.1 christos if (c == 0) {
140 1.1 christos break;
141 1.1 christos }
142 1.1 christos if (c < 0x0080) {
143 1.1 christos if (j + 1 >= j_max)
144 1.1 christos break;
145 1.1 christos dst[j++] = (uint8_t)c;
146 1.1 christos }
147 1.1 christos else if (c < 0x0800) {
148 1.1 christos if (j + 2 >= j_max)
149 1.1 christos break;
150 1.1 christos dst[j++] = 0xc0 | (uint8_t)(c >> 6);
151 1.1 christos dst[j++] = 0x80 | (uint8_t)(c & 0x3f);
152 1.1 christos }
153 1.1 christos else {
154 1.1 christos if (j + 3 >= j_max)
155 1.1 christos break;
156 1.1 christos dst[j++] = 0xe0 | (uint8_t)(c >> 12);
157 1.1 christos dst[j++] = 0x80 | (uint8_t)((c >> 6) & 0x3f);
158 1.1 christos dst[j++] = 0x80 | (uint8_t)(c & 0x3f);
159 1.1 christos }
160 1.1 christos }
161 1.1 christos if (dst != NULL)
162 1.1 christos dst[j] = '\0';
163 1.1 christos
164 1.1 christos if (osz)
165 1.1 christos *osz = j;
166 1.1 christos
167 1.1 christos return (char *)dst;
168 1.1 christos }
169 1.1 christos
170 1.1 christos /*
171 1.1 christos * ibuf = input buffer (char *)
172 1.1 christos * isz = bytes in input buffer
173 1.1 christos * obuf = output buffer (uint16_t *)
174 1.1 christos * osz = bytes in output buffer (size_t *)
175 1.1 christos *
176 1.1 christos * if (obuf == NULL), malloc obuf.
177 1.1 christos * if (obuf != NULL), write to existing output buffer.
178 1.1 christos *
179 1.1 christos * Return resulting ucs2 string.
180 1.1 christos */
181 1.1 christos PUBLIC uint16_t *
182 1.1 christos utf8_to_ucs2(const char *ibuf, size_t isz, uint16_t *obuf, size_t *osz)
183 1.1 christos {
184 1.1 christos const uint8_t *src = (const uint8_t *)ibuf;
185 1.1 christos uint16_t *dst;
186 1.1 christos uint16_t out;
187 1.1 christos size_t dsz, i, j, j_max;
188 1.1 christos
189 1.1 christos if (obuf != NULL) {
190 1.1 christos assert(osz != NULL);
191 1.1 christos dst = obuf;
192 1.1 christos dsz = *osz;
193 1.1 christos }
194 1.1 christos else {
195 1.1 christos dsz = isz * sizeof(*dst);
196 1.1 christos dst = malloc(dsz);
197 1.1 christos }
198 1.1 christos
199 1.1 christos j = 0;
200 1.1 christos j_max = dsz / sizeof(*dst);
201 1.1 christos for (i = 0; i < isz; i++) {
202 1.1 christos out = src[i];
203 1.1 christos if (out == '\0') {
204 1.1 christos break;
205 1.1 christos }
206 1.1 christos else if (j + 1 >= j_max) {
207 1.1 christos break;
208 1.1 christos }
209 1.1 christos else if ((out & 0x80) == 0) {
210 1.1 christos /* we're good to go */
211 1.1 christos }
212 1.1 christos else if ((out & 0xe0) == 0xc0) {
213 1.1 christos if (i + 1 >= isz) { /* insufficient source */
214 1.1 christos break;
215 1.1 christos }
216 1.1 christos if ((src[i + 1] & 0xc0) != 0x80) {
217 1.1 christos out = UCS2_REPLACEMENT_CHARACTER;
218 1.1 christos }
219 1.1 christos else {
220 1.1 christos out &= 0x1f;
221 1.1 christos out <<= 6;
222 1.1 christos out |= src[++i] & 0x3f;
223 1.1 christos }
224 1.1 christos }
225 1.1 christos else if ((out & 0xf0) == 0xe0) {
226 1.1 christos if (i + 2 >= isz) { /* insufficient source */
227 1.1 christos break;
228 1.1 christos }
229 1.1 christos if ((src[i + 1] & 0xc0) != 0x80 ||
230 1.1 christos (src[i + 2] & 0xc0) != 0x80) {
231 1.1 christos out = UCS2_REPLACEMENT_CHARACTER;
232 1.1 christos }
233 1.1 christos else {
234 1.1 christos out &= 0x0f;
235 1.1 christos out <<= 6;
236 1.1 christos out |= src[++i] & 0x3f;
237 1.1 christos out <<= 6;
238 1.1 christos out |= src[++i] & 0x3f;
239 1.1 christos }
240 1.1 christos }
241 1.1 christos else { /* cannot encode as USC2 */
242 1.1 christos out = UCS2_REPLACEMENT_CHARACTER;
243 1.1 christos }
244 1.1 christos dst[j++] = htole16(out);
245 1.1 christos }
246 1.1 christos dst[j] = '\0';
247 1.1 christos
248 1.1 christos if (src[i] != '\0')
249 1.1 christos warnx("bad UTF8 string");
250 1.1 christos
251 1.1 christos if (osz)
252 1.1 christos *osz = (j + 1) * sizeof(*dst);
253 1.1 christos return dst;
254 1.1 christos }
255 1.1 christos
256 1.1 christos
257 1.1 christos PUBLIC size_t
258 1.1 christos utf8_to_ucs2_size(const char *src)
259 1.1 christos {
260 1.1 christos #if 0
261 1.1 christos uint16_t *dst;
262 1.1 christos size_t sz;
263 1.1 christos
264 1.1 christos dst = utf8_to_ucs2(src, strlen(src) + 1, NULL, &sz);
265 1.1 christos free(dst);
266 1.1 christos return sz;
267 1.1 christos #else
268 1.1 christos const uint8_t *buf = (const uint8_t *)src;
269 1.1 christos uint out;
270 1.1 christos size_t i, j;
271 1.1 christos
272 1.1 christos j = 0;
273 1.1 christos for (i = 0; (out = buf[i]) != '\0'; i++) {
274 1.1 christos if ((out & 0x80) == 0) {
275 1.1 christos /* we're good to go */
276 1.1 christos }
277 1.1 christos else if ((out & 0xe0) == 0xc0) {
278 1.1 christos if ((buf[i + 1] & 0xc0) == 0x80) {
279 1.1 christos i++;
280 1.1 christos }
281 1.1 christos }
282 1.1 christos else if ((out & 0xf0) == 0xe0) {
283 1.1 christos if ((buf[i + 1] & 0xc0) == 0x80 &&
284 1.1 christos (buf[i + 2] & 0xc0) == 0x80) {
285 1.1 christos i += 2;
286 1.1 christos }
287 1.1 christos }
288 1.1 christos j++;
289 1.1 christos }
290 1.1 christos
291 1.1 christos return (j + 1) * sizeof(uint16_t);
292 1.1 christos #endif
293 1.1 christos }
294 1.1 christos
295 1.1 christos /************************************************************************
296 1.1 christos * UUID routines
297 1.1 christos */
298 1.1 christos PUBLIC int
299 1.1 christos uuid_scanf(struct uuid *uuid, const char *str)
300 1.1 christos {
301 1.1 christos
302 1.1 christos return sscanf(str,
303 1.1 christos "%08x-%04hx-%04hx-%02hhx%02hhx-%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx",
304 1.1 christos &uuid->time_low, &uuid->time_mid, &uuid->time_hi_and_version,
305 1.1 christos &uuid->clock_seq_hi_and_reserved, &uuid->clock_seq_low,
306 1.1 christos &uuid->node[0], &uuid->node[1], &uuid->node[2], &uuid->node[3],
307 1.1 christos &uuid->node[4], &uuid->node[5]);
308 1.1 christos }
309 1.1 christos
310 1.1 christos /*
311 1.1 christos * from sys/kern/kern_uuid.c
312 1.1 christos */
313 1.1 christos PUBLIC int
314 1.1 christos uuid_snprintf(char *buf, size_t sz, const struct uuid *uuid)
315 1.1 christos {
316 1.1 christos
317 1.1 christos return snprintf(buf, sz,
318 1.1 christos "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
319 1.1 christos uuid->time_low, uuid->time_mid, uuid->time_hi_and_version,
320 1.1 christos uuid->clock_seq_hi_and_reserved, uuid->clock_seq_low,
321 1.1 christos uuid->node[0], uuid->node[1], uuid->node[2], uuid->node[3],
322 1.1 christos uuid->node[4], uuid->node[5]);
323 1.1 christos }
324 1.1 christos
325 1.1 christos PUBLIC int
326 1.1 christos uuid_printf(const struct uuid *uuid)
327 1.1 christos {
328 1.1 christos char buf[UUID_STR_LEN];
329 1.1 christos
330 1.1 christos (void) uuid_snprintf(buf, sizeof(buf), uuid);
331 1.1 christos printf("%s", buf);
332 1.1 christos return (0);
333 1.1 christos }
334 1.1 christos
335 1.1 christos /************************************************************************
336 1.1 christos * Misc routines
337 1.1 christos */
338 1.1 christos
339 1.1 christos PUBLIC void
340 1.1 christos show_data(const uint8_t *buf, size_t len, const char *prefix)
341 1.1 christos {
342 1.1 christos uint8_t line_buf[17];
343 1.1 christos size_t i;
344 1.1 christos
345 1.1 christos line_buf[16] = '\0';
346 1.1 christos for (i = 0; i < len; i++) {
347 1.1 christos if ((i & 0xf) == 0)
348 1.1 christos printf("%s%08zx: ", prefix, i);
349 1.1 christos line_buf[i & 0xf] = isprint((int)buf[i]) ? buf[i] : '.';
350 1.1 christos printf("%02x ", buf[i]);
351 1.1 christos if ((i & 0xf) == 0xf)
352 1.1 christos printf(" %s\n", line_buf);
353 1.1 christos else if ((i & 0x7) == 0x7)
354 1.1 christos printf(" ");
355 1.1 christos }
356 1.1 christos i &= 0xf;
357 1.1 christos if (i != 0) {
358 1.1 christos line_buf[i] = '\0';
359 1.1 christos if (i < 8)
360 1.1 christos printf(" ");
361 1.1 christos while (i++ < 16)
362 1.1 christos printf(" ");
363 1.1 christos
364 1.1 christos printf(" %s\n", line_buf);
365 1.1 christos }
366 1.1 christos }
367 1.1 christos
368 1.1 christos PUBLIC uint16_t
369 1.1 christos strtous(const char *str, char **endptr, int base)
370 1.1 christos {
371 1.1 christos uintmax_t val;
372 1.1 christos int rstatus;
373 1.1 christos
374 1.1 christos val = strtou(str, endptr, base, 0, USHRT_MAX, &rstatus);
375 1.1 christos
376 1.1 christos switch (rstatus) {
377 1.1 christos case EINVAL:
378 1.1 christos /*###378 [lint] warning constant in conditional context [161]%%%*/
379 1.1 christos assert(0);
380 1.1 christos /*###379 [lint] warning 'break' statement not reached [193]%%%*/
381 1.1 christos break;
382 1.1 christos case ENOTSUP:
383 1.1 christos if (endptr != NULL)
384 1.1 christos break;
385 1.1 christos /*FALLTHROUGH*/
386 1.1 christos case ECANCELED:
387 1.1 christos err(EXIT_FAILURE, "invalid numeric string: %s\n", str);
388 1.1 christos case ERANGE:
389 1.1 christos err(EXIT_FAILURE, "value out of range [0,%#x]: %s\n",
390 1.1 christos USHRT_MAX, str);
391 1.1 christos default:
392 1.1 christos break;
393 1.1 christos }
394 1.1 christos
395 1.1 christos return (uint16_t)val;
396 1.1 christos }
397 1.1 christos
/*
 * Read the whole of file 'fname' ("-" means stdin) into a buffer
 * obtained from emalloc().
 *
 * fname = file to read, or "-" for stdin
 * size  = if not NULL, receives the number of bytes in the returned
 *         buffer, including the terminating NUL added here
 *
 * Data is read until end of file; a short read does not end the
 * input.  The buffer is always terminated with a single NUL byte,
 * also for an empty file.
 * XXX: a single NUL byte does not terminate UCS-2 data.
 *
 * Exits with EXIT_FAILURE if the file cannot be opened or read.
 * stdin is left open; any other file is closed before returning.
 *
 * Returns the buffer, which the caller must free(), or NULL if 'fname'
 * is NULL.
 */
char *
read_file(const char *fname, size_t *size)
{
	char *buf, *cp, *ep;
	size_t bufsz, cnt;
	ssize_t ssz;
	int fd, is_stdin;

	assert(fname != NULL);
	if (fname == NULL)
		return NULL;

	is_stdin = strcmp(fname, "-") == 0;
	if (is_stdin) {
		fd = STDIN_FILENO;
	}
	else {
		fd = open(fname, O_RDONLY);
		if (fd == -1)
			err(EXIT_FAILURE, "open");
	}

	bufsz = 0x800;
	buf = emalloc(bufsz);
	cp = buf;
	ep = buf + bufsz;
	cnt = 0;
	for (;;) {
		/* cp < ep holds here, so at least one byte is requested. */
		ssz = read(fd, cp, (size_t)(ep - cp));
		if (ssz == -1) {
			/*
			 * EAGAIN can only happen if the descriptor was
			 * handed to us in non-blocking mode.
			 */
			if (errno == EINTR || errno == EAGAIN)
				continue;
			err(EXIT_FAILURE, "read");
		}
		if (ssz == 0)	/* end of file */
			break;

		cp += ssz;
		cnt += (size_t)ssz;

		/* Buffer full: double it so the next read has room. */
		if (cp == ep) {
			bufsz *= 2;
			buf = erealloc(buf, bufsz);
			cp = buf + cnt;
			ep = buf + bufsz;
		}
	}

	/* The loop leaves cp < ep, so the terminator always fits. */
	*cp = '\0';
	cnt++;

	if (!is_stdin)
		close(fd);

	if (size != NULL)
		*size = cnt;
	return buf;
}
471