/*	$NetBSD: utils.c,v 1.3 2025/03/02 00:03:41 riastradh Exp $	*/
2 1.1 christos
3 1.1 christos /*
4 1.1 christos * Redistribution and use in source and binary forms, with or without
5 1.1 christos * modification, are permitted provided that the following conditions
6 1.1 christos * are met:
7 1.1 christos * 1. Redistributions of source code must retain the above copyright
8 1.1 christos * notice, this list of conditions and the following disclaimer.
9 1.1 christos * 2. Redistributions in binary form must reproduce the above copyright
10 1.1 christos * notice, this list of conditions and the following disclaimer in the
11 1.1 christos * documentation and/or other materials provided with the distribution.
12 1.1 christos *
13 1.1 christos * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
14 1.1 christos * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 1.1 christos * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 1.1 christos * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 1.1 christos * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 1.1 christos * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
19 1.1 christos * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 1.1 christos * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 1.1 christos * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 1.1 christos * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 1.1 christos * SUCH DAMAGE.
24 1.1 christos */
25 1.1 christos
26 1.1 christos #include <sys/cdefs.h>
27 1.1 christos #ifndef lint
28 1.3 riastrad __RCSID("$NetBSD: utils.c,v 1.3 2025/03/02 00:03:41 riastradh Exp $");
29 1.1 christos #endif /* not lint */
30 1.1 christos
31 1.1 christos #include <sys/uuid.h>
32 1.1 christos
33 1.1 christos #include <assert.h>
34 1.1 christos #include <ctype.h>
35 1.1 christos #include <err.h>
36 1.1 christos #include <errno.h>
37 1.1 christos #include <inttypes.h>
38 1.1 christos #include <limits.h>
39 1.1 christos #include <stdio.h>
40 1.1 christos #include <stdlib.h>
41 1.1 christos #include <string.h>
42 1.1 christos #include <fcntl.h>
43 1.1 christos #include <unistd.h>
44 1.1 christos #include <util.h>
45 1.1 christos
46 1.1 christos #include "defs.h"
47 1.1 christos #include "utils.h"
48 1.1 christos
49 1.1 christos #define UCS2_REPLACEMENT_CHARACTER 0xfffd
50 1.1 christos
51 1.1 christos /************************************************************************
 * Character encoding conversion routines
53 1.1 christos *
54 1.1 christos * UEFI uses UCS-2 character encoding. Note that this is not UTF-16
55 1.1 christos * as it doesn't interpret surrogate pairs.
56 1.1 christos *
57 1.1 christos * From <https://www.unicode.org/faq/utf_bom.html#utf16-11>:
58 1.1 christos *
59 1.1 christos * "UCS-2 is obsolete terminology which refers to a Unicode
60 1.1 christos * implementation up to Unicode 1.1, before surrogate code points and
61 1.1 christos * UTF-16 were added to Version 2.0 of the standard. This term should
62 1.1 christos * now be avoided.
63 1.1 christos *
64 1.1 christos * UCS-2 does not describe a data format distinct from UTF-16, because
65 1.1 christos * both use exactly the same 16-bit code unit
66 1.1 christos * representations. However, UCS-2 does not interpret surrogate code
67 1.1 christos * points, and thus cannot be used to conformantly represent
68 1.1 christos * supplementary characters.
69 1.1 christos *
70 1.1 christos * Sometimes in the past an implementation has been labeled "UCS-2" to
71 1.1 christos * indicate that it does not support supplementary characters and
72 1.1 christos * doesn't interpret pairs of surrogate code points as
73 1.1 christos * characters. Such an implementation would not handle processing of
74 1.1 christos * character properties, code point boundaries, collation, etc. for
75 1.1 christos * supplementary characters, nor would it be able to support most
76 1.1 christos * emoji, for example. [AF]"
77 1.1 christos *
78 1.1 christos * Regarding illegal UTF-8 sequences, the same document says:
79 1.1 christos *
80 1.1 christos * "None of the UTFs can generate every arbitrary byte sequence. For
81 1.1 christos * example, in UTF-8 every byte of the form 110xxxxx_2 must be
82 1.1 christos * followed with a byte of the form 10xxxxxx_2. A sequence such as
83 1.1 christos * <110xxxxx_2 0xxxxxxx_2> is illegal, and must never be
84 1.1 christos * generated. When faced with this illegal byte sequence while
85 1.1 christos * transforming or interpreting, a UTF-8 conformant process must treat
86 1.1 christos * the first byte 110xxxxx_2 as an illegal termination error: for
87 1.1 christos * example, either signaling an error, filtering the byte out, or
88 1.1 christos * representing the byte with a marker such as U+FFFD REPLACEMENT
89 1.1 christos * CHARACTER. In the latter two cases, it will continue processing at
90 1.1 christos * the second byte 0xxxxxxx_2.
91 1.1 christos *
92 1.1 christos * A conformant process must not interpret illegal or ill-formed byte
93 1.1 christos * sequences as characters, however, it may take error recovery
94 1.1 christos * actions. No conformant process may use irregular byte sequences to
95 1.1 christos * encode out-of-band information."
96 1.1 christos */
97 1.1 christos
98 1.1 christos /*
99 1.1 christos * ibuf = input buffer (uint16_t *)
100 1.1 christos * isz = bytes in input buffer
101 1.1 christos * obuf = output buffer (char *)
102 1.1 christos * osz = bytes in output buffer (size_t *)
103 1.1 christos *
104 1.1 christos * if (obuf == NULL), malloc obuf.
105 1.1 christos * if (obuf != NULL), write to existing output buffer.
106 1.1 christos *
107 1.1 christos * Return resulting utf8 string.
108 1.1 christos */
109 1.1 christos PUBLIC char *
110 1.1 christos ucs2_to_utf8(const uint16_t *ibuf, size_t isz, char *obuf, size_t *osz)
111 1.1 christos {
112 1.1 christos uint8_t *dst;
113 1.1 christos size_t dsz, i, j, j_max, n;
114 1.1 christos uint16_t c;
115 1.1 christos
116 1.1 christos assert(isz > 0);
117 1.1 christos
118 1.1 christos if (obuf != NULL) {
119 1.1 christos assert(osz != NULL);
120 1.1 christos dsz = *osz;
121 1.1 christos dst = (uint8_t *)obuf;
122 1.1 christos }
123 1.1 christos else {
124 1.1 christos dsz = isz * sizeof(*dst);
125 1.1 christos dst = malloc(dsz);
126 1.1 christos }
127 1.1 christos
128 1.1 christos /*
129 1.1 christos * Each UCS2 character will encode as at most 3 UTF8 bytes.
130 1.1 christos * 'isz' is the number of bytes in the source buffer which
131 1.1 christos * may well be larger than the UCS2 string. 'osz' is the
132 1.1 christos * actual number of bytes in the NUL terminated USC2 string.
133 1.1 christos */
134 1.1 christos n = isz / sizeof(*ibuf); /* max number of characters in input buffer */
135 1.1 christos j = 0;
136 1.1 christos j_max = dsz / sizeof(*dst);
137 1.1 christos for (i = 0; i < n; i++) {
138 1.1 christos c = le16toh(ibuf[i]);
139 1.1 christos if (c == 0) {
140 1.1 christos break;
141 1.1 christos }
142 1.1 christos if (c < 0x0080) {
143 1.1 christos if (j + 1 >= j_max)
144 1.1 christos break;
145 1.1 christos dst[j++] = (uint8_t)c;
146 1.1 christos }
147 1.1 christos else if (c < 0x0800) {
148 1.1 christos if (j + 2 >= j_max)
149 1.1 christos break;
150 1.1 christos dst[j++] = 0xc0 | (uint8_t)(c >> 6);
151 1.1 christos dst[j++] = 0x80 | (uint8_t)(c & 0x3f);
152 1.1 christos }
153 1.1 christos else {
154 1.1 christos if (j + 3 >= j_max)
155 1.1 christos break;
156 1.1 christos dst[j++] = 0xe0 | (uint8_t)(c >> 12);
157 1.1 christos dst[j++] = 0x80 | (uint8_t)((c >> 6) & 0x3f);
158 1.1 christos dst[j++] = 0x80 | (uint8_t)(c & 0x3f);
159 1.1 christos }
160 1.1 christos }
161 1.1 christos if (dst != NULL)
162 1.1 christos dst[j] = '\0';
163 1.1 christos
164 1.1 christos if (osz)
165 1.1 christos *osz = j;
166 1.1 christos
167 1.1 christos return (char *)dst;
168 1.1 christos }
169 1.1 christos
170 1.1 christos /*
171 1.1 christos * ibuf = input buffer (char *)
172 1.1 christos * isz = bytes in input buffer
173 1.1 christos * obuf = output buffer (uint16_t *)
174 1.1 christos * osz = bytes in output buffer (size_t *)
175 1.1 christos *
176 1.1 christos * if (obuf == NULL), malloc obuf.
177 1.1 christos * if (obuf != NULL), write to existing output buffer.
178 1.1 christos *
179 1.1 christos * Return resulting ucs2 string.
180 1.1 christos */
181 1.1 christos PUBLIC uint16_t *
182 1.1 christos utf8_to_ucs2(const char *ibuf, size_t isz, uint16_t *obuf, size_t *osz)
183 1.1 christos {
184 1.1 christos const uint8_t *src = (const uint8_t *)ibuf;
185 1.1 christos uint16_t *dst;
186 1.1 christos uint16_t out;
187 1.1 christos size_t dsz, i, j, j_max;
188 1.1 christos
189 1.1 christos if (obuf != NULL) {
190 1.1 christos assert(osz != NULL);
191 1.1 christos dst = obuf;
192 1.1 christos dsz = *osz;
193 1.1 christos }
194 1.1 christos else {
195 1.1 christos dsz = isz * sizeof(*dst);
196 1.1 christos dst = malloc(dsz);
197 1.1 christos }
198 1.1 christos
199 1.1 christos j = 0;
200 1.1 christos j_max = dsz / sizeof(*dst);
201 1.1 christos for (i = 0; i < isz; i++) {
202 1.1 christos out = src[i];
203 1.1 christos if (out == '\0') {
204 1.1 christos break;
205 1.1 christos }
206 1.1 christos else if (j + 1 >= j_max) {
207 1.1 christos break;
208 1.1 christos }
209 1.1 christos else if ((out & 0x80) == 0) {
210 1.1 christos /* we're good to go */
211 1.1 christos }
212 1.1 christos else if ((out & 0xe0) == 0xc0) {
213 1.1 christos if (i + 1 >= isz) { /* insufficient source */
214 1.1 christos break;
215 1.1 christos }
216 1.1 christos if ((src[i + 1] & 0xc0) != 0x80) {
217 1.1 christos out = UCS2_REPLACEMENT_CHARACTER;
218 1.1 christos }
219 1.1 christos else {
220 1.1 christos out &= 0x1f;
221 1.1 christos out <<= 6;
222 1.1 christos out |= src[++i] & 0x3f;
223 1.1 christos }
224 1.1 christos }
225 1.1 christos else if ((out & 0xf0) == 0xe0) {
226 1.1 christos if (i + 2 >= isz) { /* insufficient source */
227 1.1 christos break;
228 1.1 christos }
229 1.1 christos if ((src[i + 1] & 0xc0) != 0x80 ||
230 1.1 christos (src[i + 2] & 0xc0) != 0x80) {
231 1.1 christos out = UCS2_REPLACEMENT_CHARACTER;
232 1.1 christos }
233 1.1 christos else {
234 1.1 christos out &= 0x0f;
235 1.1 christos out <<= 6;
236 1.1 christos out |= src[++i] & 0x3f;
237 1.1 christos out <<= 6;
238 1.1 christos out |= src[++i] & 0x3f;
239 1.1 christos }
240 1.1 christos }
241 1.1 christos else { /* cannot encode as USC2 */
242 1.1 christos out = UCS2_REPLACEMENT_CHARACTER;
243 1.1 christos }
244 1.1 christos dst[j++] = htole16(out);
245 1.1 christos }
246 1.1 christos dst[j] = '\0';
247 1.1 christos
248 1.1 christos if (src[i] != '\0')
249 1.1 christos warnx("bad UTF8 string");
250 1.1 christos
251 1.1 christos if (osz)
252 1.1 christos *osz = (j + 1) * sizeof(*dst);
253 1.1 christos return dst;
254 1.1 christos }
255 1.1 christos
/*
 * Return the number of bytes needed to hold the NUL-terminated UTF-8
 * string 'src' as UCS-2, including the terminating NUL code unit.
 *
 * Nothing is converted or allocated; the string is only scanned.  A
 * two or three byte lead followed by the right number of continuation
 * bytes counts as one code unit; every other byte counts as one code
 * unit by itself.  (Presumably meant to match what utf8_to_ucs2()
 * reports in *osz for the same string - verify before relying on it.)
 */
PUBLIC size_t
utf8_to_ucs2_size(const char *src)
{
	const uint8_t *p = (const uint8_t *)src;
	size_t nunits = 0;

	while (*p != '\0') {
		size_t len = 1;		/* bytes consumed for this unit */

		if ((*p & 0xe0) == 0xc0) {
			if ((p[1] & 0xc0) == 0x80)
				len = 2;
		}
		else if ((*p & 0xf0) == 0xe0) {
			/*
			 * p[2] is only examined when p[1] is a
			 * continuation byte, i.e. not the terminator.
			 */
			if ((p[1] & 0xc0) == 0x80 &&
			    (p[2] & 0xc0) == 0x80)
				len = 3;
		}

		p += len;
		nunits++;
	}

	return (nunits + 1) * sizeof(uint16_t);
}
293 1.1 christos
294 1.1 christos /************************************************************************
295 1.1 christos * UUID routines
296 1.1 christos */
297 1.1 christos PUBLIC int
298 1.1 christos uuid_scanf(struct uuid *uuid, const char *str)
299 1.1 christos {
300 1.1 christos
301 1.1 christos return sscanf(str,
302 1.1 christos "%08x-%04hx-%04hx-%02hhx%02hhx-%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx",
303 1.1 christos &uuid->time_low, &uuid->time_mid, &uuid->time_hi_and_version,
304 1.1 christos &uuid->clock_seq_hi_and_reserved, &uuid->clock_seq_low,
305 1.1 christos &uuid->node[0], &uuid->node[1], &uuid->node[2], &uuid->node[3],
306 1.1 christos &uuid->node[4], &uuid->node[5]);
307 1.1 christos }
308 1.1 christos
309 1.1 christos /*
310 1.1 christos * from sys/kern/kern_uuid.c
311 1.1 christos */
312 1.1 christos PUBLIC int
313 1.1 christos uuid_snprintf(char *buf, size_t sz, const struct uuid *uuid)
314 1.1 christos {
315 1.1 christos
316 1.1 christos return snprintf(buf, sz,
317 1.1 christos "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
318 1.1 christos uuid->time_low, uuid->time_mid, uuid->time_hi_and_version,
319 1.1 christos uuid->clock_seq_hi_and_reserved, uuid->clock_seq_low,
320 1.1 christos uuid->node[0], uuid->node[1], uuid->node[2], uuid->node[3],
321 1.1 christos uuid->node[4], uuid->node[5]);
322 1.1 christos }
323 1.1 christos
324 1.1 christos PUBLIC int
325 1.1 christos uuid_printf(const struct uuid *uuid)
326 1.1 christos {
327 1.1 christos char buf[UUID_STR_LEN];
328 1.1 christos
329 1.1 christos (void) uuid_snprintf(buf, sizeof(buf), uuid);
330 1.1 christos printf("%s", buf);
331 1.1 christos return (0);
332 1.1 christos }
333 1.1 christos
334 1.1 christos /************************************************************************
335 1.1 christos * Misc routines
336 1.1 christos */
337 1.1 christos
/*
 * Hex dump 'len' bytes starting at 'buf' to stdout, 16 bytes per line.
 *
 * Each line consists of 'prefix', the offset of the first byte on the
 * line as 8 hex digits, the bytes in hex (with an extra space after
 * the eighth), and the same bytes as text with non-printable ones
 * shown as '.'.  A short final line is padded so that its text column
 * lines up with the full lines above it.
 */
PUBLIC void
show_data(const uint8_t *buf, size_t len, const char *prefix)
{
	char line_buf[17];	/* text column: 16 characters + NUL */
	size_t i;

	line_buf[16] = '\0';
	for (i = 0; i < len; i++) {
		if ((i & 0xf) == 0)
			printf("%s%08zx: ", prefix, i);
		line_buf[i & 0xf] = isprint(buf[i]) ? (char)buf[i] : '.';
		printf("%02x ", buf[i]);
		if ((i & 0xf) == 0xf)
			printf(" %s\n", line_buf);
		else if ((i & 0x7) == 0x7)
			printf(" ");
	}

	/* Finish a partially filled last line. */
	i &= 0xf;
	if (i != 0) {
		line_buf[i] = '\0';
		if (i < 8)	/* mid-line gap has not been printed yet */
			printf(" ");
		/* Each missing byte would have occupied "xx ". */
		for (; i < 16; i++)
			printf("   ");

		printf(" %s\n", line_buf);
	}
}
366 1.1 christos
367 1.1 christos PUBLIC uint16_t
368 1.1 christos strtous(const char *str, char **endptr, int base)
369 1.1 christos {
370 1.1 christos uintmax_t val;
371 1.1 christos int rstatus;
372 1.1 christos
373 1.1 christos val = strtou(str, endptr, base, 0, USHRT_MAX, &rstatus);
374 1.1 christos
375 1.1 christos switch (rstatus) {
376 1.1 christos case EINVAL:
377 1.1 christos assert(0);
378 1.1 christos break;
379 1.1 christos case ENOTSUP:
380 1.1 christos if (endptr != NULL)
381 1.1 christos break;
382 1.1 christos /*FALLTHROUGH*/
383 1.1 christos case ECANCELED:
384 1.1 christos err(EXIT_FAILURE, "invalid numeric string: %s\n", str);
385 1.1 christos case ERANGE:
386 1.1 christos err(EXIT_FAILURE, "value out of range [0,%#x]: %s\n",
387 1.1 christos USHRT_MAX, str);
388 1.1 christos default:
389 1.1 christos break;
390 1.1 christos }
391 1.1 christos
392 1.1 christos return (uint16_t)val;
393 1.1 christos }
394 1.1 christos
/*
 * Read the contents of 'fname' into a freshly allocated buffer.
 *
 * fname = file to read, or "-" for stdin.  While stdin is being read
 *         it is switched to non-blocking mode; its original flags are
 *         restored afterwards.
 * size  = if not NULL, receives the number of bytes stored in the
 *         buffer, including the NUL byte appended after the data.
 *
 * The returned buffer is always NUL terminated and is owned by the
 * caller.  Any failure to open or read the file is fatal (err(3)).
 *
 * NOTE(review): reading stops at end of file or at the first read(2)
 * that returns fewer bytes than requested.  That is the long-standing
 * behaviour and is kept here, but on a pipe or terminal a short read
 * does not have to mean end of input - confirm this is what callers
 * want.  EAGAIN is retried immediately (busy wait).
 */
char *
read_file(const char *fname, size_t *size)
{
	char *buf, *cp, *ep;
	size_t bufsz, cnt;
	ssize_t ssz;
	int fd, fd_flags;

	assert(fname != NULL);
	if (fname == NULL)
		return NULL;

	if (strcmp(fname, "-") == 0) {
		fd = STDIN_FILENO;
		if ((fd_flags = fcntl(fd, F_GETFL)) == -1)
			err(EXIT_FAILURE, "fcntl F_GETFL");

		if (fcntl(fd, F_SETFL, O_NONBLOCK | fd_flags) == -1)
			err(EXIT_FAILURE, "fcntl F_SETFL");
	}
	else {
		fd_flags = -1;	/* marks "we opened it, so we close it" */
		fd = open(fname, O_RDONLY);
		if (fd == -1)
			err(EXIT_FAILURE, "open");
	}

	bufsz = 0x800;
	buf = emalloc(bufsz);
	cp = buf;		/* next free byte */
	ep = buf + bufsz;	/* one past the end of the buffer */
	cnt = 0;		/* bytes stored so far */
	for (;;) {
		/* Invariant: cp < ep, so there is always room to read. */
		ssz = read(fd, cp, (size_t)(ep - cp));
		if (ssz == -1) {
			if (errno == EAGAIN)
				continue;
			err(EXIT_FAILURE, "read");
		}
		assert(ssz >= 0);

		cp += ssz;
		cnt += (size_t)ssz;

		if (cp < ep) {
			/*
			 * End of file or short read: terminate the
			 * data.  Doing this for the zero byte read too
			 * means empty input and input that exactly
			 * filled the buffer are terminated and counted
			 * the same way as everything else.
			 */
			/* XXX: what about UCS-2? */
			*cp = '\0';
			cnt++;
			break;
		}

		/* The buffer is full: double it and keep reading. */
		bufsz *= 2;
		buf = erealloc(buf, bufsz);
		cp = buf + cnt;
		ep = buf + bufsz;
	}

	if (fd_flags != -1)
		(void)fcntl(fd, F_SETFL, fd_flags);
	else
		(void)close(fd);

	if (size != NULL)
		*size = cnt;
	return buf;
}
468