utils.c revision 1.3 1 /* $NetBSD: utils.c,v 1.3 2025/03/02 00:03:41 riastradh Exp $ */
2
3 /*
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
14 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
19 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26 #include <sys/cdefs.h>
27 #ifndef lint
28 __RCSID("$NetBSD: utils.c,v 1.3 2025/03/02 00:03:41 riastradh Exp $");
29 #endif /* not lint */
30
31 #include <sys/uuid.h>
32
33 #include <assert.h>
34 #include <ctype.h>
35 #include <err.h>
36 #include <errno.h>
37 #include <inttypes.h>
38 #include <limits.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #include <fcntl.h>
43 #include <unistd.h>
44 #include <util.h>
45
46 #include "defs.h"
47 #include "utils.h"
48
49 #define UCS2_REPLACEMENT_CHARACTER 0xfffd
50
51 /************************************************************************
52 * Character encoding conversion routines
53 *
54 * UEFI uses UCS-2 character encoding. Note that this is not UTF-16
55 * as it doesn't interpret surrogate pairs.
56 *
57 * From <https://www.unicode.org/faq/utf_bom.html#utf16-11>:
58 *
59 * "UCS-2 is obsolete terminology which refers to a Unicode
60 * implementation up to Unicode 1.1, before surrogate code points and
61 * UTF-16 were added to Version 2.0 of the standard. This term should
62 * now be avoided.
63 *
64 * UCS-2 does not describe a data format distinct from UTF-16, because
65 * both use exactly the same 16-bit code unit
66 * representations. However, UCS-2 does not interpret surrogate code
67 * points, and thus cannot be used to conformantly represent
68 * supplementary characters.
69 *
70 * Sometimes in the past an implementation has been labeled "UCS-2" to
71 * indicate that it does not support supplementary characters and
72 * doesn't interpret pairs of surrogate code points as
73 * characters. Such an implementation would not handle processing of
74 * character properties, code point boundaries, collation, etc. for
75 * supplementary characters, nor would it be able to support most
76 * emoji, for example. [AF]"
77 *
78 * Regarding illegal UTF-8 sequences, the same document says:
79 *
80 * "None of the UTFs can generate every arbitrary byte sequence. For
81 * example, in UTF-8 every byte of the form 110xxxxx_2 must be
82 * followed with a byte of the form 10xxxxxx_2. A sequence such as
83 * <110xxxxx_2 0xxxxxxx_2> is illegal, and must never be
84 * generated. When faced with this illegal byte sequence while
85 * transforming or interpreting, a UTF-8 conformant process must treat
86 * the first byte 110xxxxx_2 as an illegal termination error: for
87 * example, either signaling an error, filtering the byte out, or
88 * representing the byte with a marker such as U+FFFD REPLACEMENT
89 * CHARACTER. In the latter two cases, it will continue processing at
90 * the second byte 0xxxxxxx_2.
91 *
92 * A conformant process must not interpret illegal or ill-formed byte
93 * sequences as characters, however, it may take error recovery
94 * actions. No conformant process may use irregular byte sequences to
95 * encode out-of-band information."
96 */
97
98 /*
99 * ibuf = input buffer (uint16_t *)
100 * isz = bytes in input buffer
101 * obuf = output buffer (char *)
102 * osz = bytes in output buffer (size_t *)
103 *
104 * if (obuf == NULL), malloc obuf.
105 * if (obuf != NULL), write to existing output buffer.
106 *
107 * Return resulting utf8 string.
108 */
109 PUBLIC char *
110 ucs2_to_utf8(const uint16_t *ibuf, size_t isz, char *obuf, size_t *osz)
111 {
112 uint8_t *dst;
113 size_t dsz, i, j, j_max, n;
114 uint16_t c;
115
116 assert(isz > 0);
117
118 if (obuf != NULL) {
119 assert(osz != NULL);
120 dsz = *osz;
121 dst = (uint8_t *)obuf;
122 }
123 else {
124 dsz = isz * sizeof(*dst);
125 dst = malloc(dsz);
126 }
127
128 /*
129 * Each UCS2 character will encode as at most 3 UTF8 bytes.
130 * 'isz' is the number of bytes in the source buffer which
131 * may well be larger than the UCS2 string. 'osz' is the
132 * actual number of bytes in the NUL terminated USC2 string.
133 */
134 n = isz / sizeof(*ibuf); /* max number of characters in input buffer */
135 j = 0;
136 j_max = dsz / sizeof(*dst);
137 for (i = 0; i < n; i++) {
138 c = le16toh(ibuf[i]);
139 if (c == 0) {
140 break;
141 }
142 if (c < 0x0080) {
143 if (j + 1 >= j_max)
144 break;
145 dst[j++] = (uint8_t)c;
146 }
147 else if (c < 0x0800) {
148 if (j + 2 >= j_max)
149 break;
150 dst[j++] = 0xc0 | (uint8_t)(c >> 6);
151 dst[j++] = 0x80 | (uint8_t)(c & 0x3f);
152 }
153 else {
154 if (j + 3 >= j_max)
155 break;
156 dst[j++] = 0xe0 | (uint8_t)(c >> 12);
157 dst[j++] = 0x80 | (uint8_t)((c >> 6) & 0x3f);
158 dst[j++] = 0x80 | (uint8_t)(c & 0x3f);
159 }
160 }
161 if (dst != NULL)
162 dst[j] = '\0';
163
164 if (osz)
165 *osz = j;
166
167 return (char *)dst;
168 }
169
170 /*
171 * ibuf = input buffer (char *)
172 * isz = bytes in input buffer
173 * obuf = output buffer (uint16_t *)
174 * osz = bytes in output buffer (size_t *)
175 *
176 * if (obuf == NULL), malloc obuf.
177 * if (obuf != NULL), write to existing output buffer.
178 *
179 * Return resulting ucs2 string.
180 */
181 PUBLIC uint16_t *
182 utf8_to_ucs2(const char *ibuf, size_t isz, uint16_t *obuf, size_t *osz)
183 {
184 const uint8_t *src = (const uint8_t *)ibuf;
185 uint16_t *dst;
186 uint16_t out;
187 size_t dsz, i, j, j_max;
188
189 if (obuf != NULL) {
190 assert(osz != NULL);
191 dst = obuf;
192 dsz = *osz;
193 }
194 else {
195 dsz = isz * sizeof(*dst);
196 dst = malloc(dsz);
197 }
198
199 j = 0;
200 j_max = dsz / sizeof(*dst);
201 for (i = 0; i < isz; i++) {
202 out = src[i];
203 if (out == '\0') {
204 break;
205 }
206 else if (j + 1 >= j_max) {
207 break;
208 }
209 else if ((out & 0x80) == 0) {
210 /* we're good to go */
211 }
212 else if ((out & 0xe0) == 0xc0) {
213 if (i + 1 >= isz) { /* insufficient source */
214 break;
215 }
216 if ((src[i + 1] & 0xc0) != 0x80) {
217 out = UCS2_REPLACEMENT_CHARACTER;
218 }
219 else {
220 out &= 0x1f;
221 out <<= 6;
222 out |= src[++i] & 0x3f;
223 }
224 }
225 else if ((out & 0xf0) == 0xe0) {
226 if (i + 2 >= isz) { /* insufficient source */
227 break;
228 }
229 if ((src[i + 1] & 0xc0) != 0x80 ||
230 (src[i + 2] & 0xc0) != 0x80) {
231 out = UCS2_REPLACEMENT_CHARACTER;
232 }
233 else {
234 out &= 0x0f;
235 out <<= 6;
236 out |= src[++i] & 0x3f;
237 out <<= 6;
238 out |= src[++i] & 0x3f;
239 }
240 }
241 else { /* cannot encode as USC2 */
242 out = UCS2_REPLACEMENT_CHARACTER;
243 }
244 dst[j++] = htole16(out);
245 }
246 dst[j] = '\0';
247
248 if (src[i] != '\0')
249 warnx("bad UTF8 string");
250
251 if (osz)
252 *osz = (j + 1) * sizeof(*dst);
253 return dst;
254 }
255
256 PUBLIC size_t
257 utf8_to_ucs2_size(const char *src)
258 {
259 #if 0
260 uint16_t *dst;
261 size_t sz;
262
263 dst = utf8_to_ucs2(src, strlen(src) + 1, NULL, &sz);
264 free(dst);
265 return sz;
266 #else
267 const uint8_t *buf = (const uint8_t *)src;
268 uint out;
269 size_t i, j;
270
271 j = 0;
272 for (i = 0; (out = buf[i]) != '\0'; i++) {
273 if ((out & 0x80) == 0) {
274 /* we're good to go */
275 }
276 else if ((out & 0xe0) == 0xc0) {
277 if ((buf[i + 1] & 0xc0) == 0x80) {
278 i++;
279 }
280 }
281 else if ((out & 0xf0) == 0xe0) {
282 if ((buf[i + 1] & 0xc0) == 0x80 &&
283 (buf[i + 2] & 0xc0) == 0x80) {
284 i += 2;
285 }
286 }
287 j++;
288 }
289
290 return (j + 1) * sizeof(uint16_t);
291 #endif
292 }
293
294 /************************************************************************
295 * UUID routines
296 */
297 PUBLIC int
298 uuid_scanf(struct uuid *uuid, const char *str)
299 {
300
301 return sscanf(str,
302 "%08x-%04hx-%04hx-%02hhx%02hhx-%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx",
303 &uuid->time_low, &uuid->time_mid, &uuid->time_hi_and_version,
304 &uuid->clock_seq_hi_and_reserved, &uuid->clock_seq_low,
305 &uuid->node[0], &uuid->node[1], &uuid->node[2], &uuid->node[3],
306 &uuid->node[4], &uuid->node[5]);
307 }
308
309 /*
310 * from sys/kern/kern_uuid.c
311 */
312 PUBLIC int
313 uuid_snprintf(char *buf, size_t sz, const struct uuid *uuid)
314 {
315
316 return snprintf(buf, sz,
317 "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
318 uuid->time_low, uuid->time_mid, uuid->time_hi_and_version,
319 uuid->clock_seq_hi_and_reserved, uuid->clock_seq_low,
320 uuid->node[0], uuid->node[1], uuid->node[2], uuid->node[3],
321 uuid->node[4], uuid->node[5]);
322 }
323
324 PUBLIC int
325 uuid_printf(const struct uuid *uuid)
326 {
327 char buf[UUID_STR_LEN];
328
329 (void) uuid_snprintf(buf, sizeof(buf), uuid);
330 printf("%s", buf);
331 return (0);
332 }
333
334 /************************************************************************
335 * Misc routines
336 */
337
338 PUBLIC void
339 show_data(const uint8_t *buf, size_t len, const char *prefix)
340 {
341 uint8_t line_buf[17];
342 size_t i;
343
344 line_buf[16] = '\0';
345 for (i = 0; i < len; i++) {
346 if ((i & 0xf) == 0)
347 printf("%s%08zx: ", prefix, i);
348 line_buf[i & 0xf] = isprint((int)buf[i]) ? buf[i] : '.';
349 printf("%02x ", buf[i]);
350 if ((i & 0xf) == 0xf)
351 printf(" %s\n", line_buf);
352 else if ((i & 0x7) == 0x7)
353 printf(" ");
354 }
355 i &= 0xf;
356 if (i != 0) {
357 line_buf[i] = '\0';
358 if (i < 8)
359 printf(" ");
360 while (i++ < 16)
361 printf(" ");
362
363 printf(" %s\n", line_buf);
364 }
365 }
366
367 PUBLIC uint16_t
368 strtous(const char *str, char **endptr, int base)
369 {
370 uintmax_t val;
371 int rstatus;
372
373 val = strtou(str, endptr, base, 0, USHRT_MAX, &rstatus);
374
375 switch (rstatus) {
376 case EINVAL:
377 assert(0);
378 break;
379 case ENOTSUP:
380 if (endptr != NULL)
381 break;
382 /*FALLTHROUGH*/
383 case ECANCELED:
384 err(EXIT_FAILURE, "invalid numeric string: %s\n", str);
385 case ERANGE:
386 err(EXIT_FAILURE, "value out of range [0,%#x]: %s\n",
387 USHRT_MAX, str);
388 default:
389 break;
390 }
391
392 return (uint16_t)val;
393 }
394
/*
 * Read the contents of 'fname' ("-" means stdin) into a buffer obtained
 * from emalloc/erealloc and return it.
 *
 * The data is always followed by a NUL byte, and *size is set to the
 * number of data bytes plus one for that NUL.  Any failure to open or
 * read terminates the program via err(3).
 *
 * Reading stops at end of file or after the first read(2) that returns
 * fewer bytes than requested.
 * NOTE(review): on a pipe or terminal a short read does not have to
 * mean end of input; presumably this is intended for interactive
 * use of stdin -- confirm before changing.
 */
char *
read_file(const char *fname, size_t *size)
{
	char *buf, *cp, *ep;
	size_t bufsz, cnt;
	ssize_t ssz;
	int fd, fd_flags;

	assert(fname != NULL);
	if (fname == NULL)
		return NULL;

	if (strcmp(fname, "-") == 0) {
		/* stdin is switched to non-blocking mode while we read it */
		fd = STDIN_FILENO;
		if ((fd_flags = fcntl(fd, F_GETFL)) == -1)
			err(EXIT_FAILURE, "fcntl F_GETFL");

		if (fcntl(fd, F_SETFL, O_NONBLOCK | fd_flags) == -1)
			err(EXIT_FAILURE, "fcntl F_SETFL");
	}
	else {
		fd_flags = -1;	/* marks "fd is ours, close it when done" */
		fd = open(fname, O_RDONLY);
		if (fd == -1)
			err(EXIT_FAILURE, "open");
	}

	bufsz = 0x800;
	buf = emalloc(bufsz);
	cp = buf;
	ep = buf + bufsz;
	cnt = 0;
	for (;;) {
		/* invariant: cp < ep, so there is always room to read into */
		ssz = read(fd, cp, (size_t)(ep - cp));
		if (ssz == -1) {
			/*
			 * EAGAIN: no data yet on the non-blocking fd (this
			 * retries immediately, i.e. it spins until data
			 * shows up).  EINTR: interrupted, just retry.
			 */
			if (errno == EAGAIN || errno == EINTR)
				continue;
			err(EXIT_FAILURE, "read");
		}
		assert(ssz >= 0);

		cp += ssz;
		cnt += (size_t)ssz;

		if (cp < ep) {
			/*
			 * End of file (ssz == 0) or a short read: either
			 * way there is room left, so terminate the data
			 * and count the terminator.
			 */
			/* XXX: what about UCS-2? */
			*cp = '\0';
			cnt++;
			break;
		}

		/* buffer completely filled: double it and keep reading */
		bufsz *= 2;
		buf = erealloc(buf, bufsz);
		cp = buf + cnt;
		ep = buf + bufsz;
	}

	if (fd_flags != -1)
		fcntl(fd, F_SETFL, fd_flags);	/* restore stdin's flags */
	else
		close(fd);

	*size = cnt;
	return buf;
}
468