utils.c revision 1.1 1 /* $NetBSD: utils.c,v 1.1 2025/02/24 13:47:57 christos Exp $ */
2
3 /*
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
14 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
19 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26 #include <sys/cdefs.h>
27 #ifndef lint
28 __RCSID("$NetBSD: utils.c,v 1.1 2025/02/24 13:47:57 christos Exp $");
29 #endif /* not lint */
30
31 #include <sys/uuid.h>
32
33 #include <assert.h>
34 #include <ctype.h>
35 #include <err.h>
36 #include <errno.h>
37 #include <inttypes.h>
38 #include <limits.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #include <fcntl.h>
43 #include <unistd.h>
44 #include <util.h>
45
46 #include "defs.h"
47 #include "utils.h"
48
49 #define UCS2_REPLACEMENT_CHARACTER 0xfffd
50
51 /************************************************************************
 * Character encoding conversion routines
53 *
54 * UEFI uses UCS-2 character encoding. Note that this is not UTF-16
55 * as it doesn't interpret surrogate pairs.
56 *
57 * From <https://www.unicode.org/faq/utf_bom.html#utf16-11>:
58 *
59 * "UCS-2 is obsolete terminology which refers to a Unicode
60 * implementation up to Unicode 1.1, before surrogate code points and
61 * UTF-16 were added to Version 2.0 of the standard. This term should
62 * now be avoided.
63 *
64 * UCS-2 does not describe a data format distinct from UTF-16, because
65 * both use exactly the same 16-bit code unit
66 * representations. However, UCS-2 does not interpret surrogate code
67 * points, and thus cannot be used to conformantly represent
68 * supplementary characters.
69 *
70 * Sometimes in the past an implementation has been labeled "UCS-2" to
71 * indicate that it does not support supplementary characters and
72 * doesn't interpret pairs of surrogate code points as
73 * characters. Such an implementation would not handle processing of
74 * character properties, code point boundaries, collation, etc. for
75 * supplementary characters, nor would it be able to support most
76 * emoji, for example. [AF]"
77 *
78 * Regarding illegal UTF-8 sequences, the same document says:
79 *
80 * "None of the UTFs can generate every arbitrary byte sequence. For
81 * example, in UTF-8 every byte of the form 110xxxxx_2 must be
82 * followed with a byte of the form 10xxxxxx_2. A sequence such as
83 * <110xxxxx_2 0xxxxxxx_2> is illegal, and must never be
84 * generated. When faced with this illegal byte sequence while
85 * transforming or interpreting, a UTF-8 conformant process must treat
86 * the first byte 110xxxxx_2 as an illegal termination error: for
87 * example, either signaling an error, filtering the byte out, or
88 * representing the byte with a marker such as U+FFFD REPLACEMENT
89 * CHARACTER. In the latter two cases, it will continue processing at
90 * the second byte 0xxxxxxx_2.
91 *
92 * A conformant process must not interpret illegal or ill-formed byte
93 * sequences as characters, however, it may take error recovery
94 * actions. No conformant process may use irregular byte sequences to
95 * encode out-of-band information."
96 */
97
/*
 * Convert a little-endian UCS-2 string to UTF-8.
 *
 * ibuf = input buffer (uint16_t *), little-endian code units
 * isz  = bytes in input buffer; conversion stops at the first NUL code
 *        unit or when the buffer is exhausted, whichever comes first
 * obuf = output buffer (char *), or NULL
 * osz  = on entry (only read if obuf != NULL): bytes available in obuf
 *        on return (if osz != NULL): UTF-8 bytes written, not counting
 *        the terminating NUL
 *
 * if (obuf == NULL), malloc obuf, sized for the worst case of 3 UTF-8
 * bytes per code unit plus the terminating NUL.
 * if (obuf != NULL), write to existing output buffer.  Output that does
 * not fit is truncated on a character boundary.
 *
 * Code units are not interpreted as surrogate pairs; every unit is
 * encoded on its own.
 *
 * Return resulting utf8 string, NUL terminated.  Returns NULL if the
 * allocation fails.  If a caller supplied buffer has size 0 it is
 * returned untouched (there is no room for even the terminator).
 */
PUBLIC char *
ucs2_to_utf8(const uint16_t *ibuf, size_t isz, char *obuf, size_t *osz)
{
	uint8_t *dst;
	size_t dsz, i, j, n, need;
	uint16_t c;

	assert(isz > 0);

	/*
	 * 'isz' is the number of bytes in the source buffer, which may
	 * well be larger than the NUL terminated UCS2 string it holds.
	 */
	n = isz / sizeof(*ibuf); /* max number of characters in input buffer */

	if (obuf != NULL) {
		assert(osz != NULL);
		dsz = *osz;
		dst = (uint8_t *)obuf;
	}
	else {
		/* Each UCS2 character will encode as at most 3 UTF8 bytes. */
		if (n > (SIZE_MAX - 1) / 3) {
			dsz = 0;
			dst = NULL;
		}
		else {
			dsz = n * 3 + 1;
			dst = malloc(dsz);
		}
	}

	if (dst == NULL || dsz == 0) {
		if (osz != NULL)
			*osz = 0;
		return (char *)dst;
	}

	/*
	 * Invariant: j < dsz, so dst[j] is always a valid slot for the
	 * terminating NUL.  A character is only emitted if the terminator
	 * still fits behind it.
	 */
	j = 0;
	for (i = 0; i < n; i++) {
		c = le16toh(ibuf[i]);
		if (c == 0) {
			break;
		}
		if (c < 0x0080)
			need = 1;
		else if (c < 0x0800)
			need = 2;
		else
			need = 3;

		if (need >= dsz - j)
			break;

		if (need == 1) {
			dst[j++] = (uint8_t)c;
		}
		else if (need == 2) {
			dst[j++] = 0xc0 | (uint8_t)(c >> 6);
			dst[j++] = 0x80 | (uint8_t)(c & 0x3f);
		}
		else {
			dst[j++] = 0xe0 | (uint8_t)(c >> 12);
			dst[j++] = 0x80 | (uint8_t)((c >> 6) & 0x3f);
			dst[j++] = 0x80 | (uint8_t)(c & 0x3f);
		}
	}
	dst[j] = '\0';

	if (osz)
		*osz = j;

	return (char *)dst;
}
169
/*
 * Convert a UTF-8 string to little-endian UCS-2.
 *
 * ibuf = input buffer (char *)
 * isz  = bytes in input buffer; conversion stops at the first NUL byte
 *        or when the buffer is exhausted, whichever comes first
 * obuf = output buffer (uint16_t *), or NULL
 * osz  = on entry (only read if obuf != NULL): bytes available in obuf
 *        on return (if osz != NULL): bytes written, including the
 *        terminating NUL code unit
 *
 * if (obuf == NULL), malloc obuf, sized for one code unit per input
 * byte plus the terminating NUL code unit.
 * if (obuf != NULL), write to existing output buffer.
 *
 * Well-formed 1, 2 and 3 byte sequences become one code unit.  A lead
 * byte whose continuation bytes are malformed, and any byte that cannot
 * start such a sequence (stray continuation bytes, 4-byte leads), is
 * replaced by U+FFFD and scanning resumes at the next byte.
 *
 * A warning is printed if conversion stopped before the end of the
 * input on something other than a NUL (output full, or a multi-byte
 * sequence cut off by the end of the input).
 *
 * Return resulting ucs2 string, NUL terminated.  Returns NULL (with
 * *osz set to 0) if the allocation fails.  A caller supplied buffer too
 * small for even the terminator is returned untouched with *osz set
 * to 0.
 */
PUBLIC uint16_t *
utf8_to_ucs2(const char *ibuf, size_t isz, uint16_t *obuf, size_t *osz)
{
	const uint8_t *src = (const uint8_t *)ibuf;
	uint16_t *dst;
	uint16_t out;
	size_t dsz, i, j, j_max;

	if (obuf != NULL) {
		assert(osz != NULL);
		dst = obuf;
		dsz = *osz;
	}
	else if (isz >= SIZE_MAX / sizeof(*dst)) {
		dsz = 0;
		dst = NULL;
	}
	else {
		/* the extra unit keeps room for the terminator */
		dsz = (isz + 1) * sizeof(*dst);
		dst = malloc(dsz);
	}

	j_max = dsz / sizeof(*dst);
	if (dst == NULL || j_max == 0) {
		if (osz != NULL)
			*osz = 0;
		return dst;
	}

	j = 0;
	for (i = 0; i < isz; i++) {
		out = src[i];
		if (out == '\0') {
			break;
		}
		else if (j + 1 >= j_max) {	/* keep a slot for the NUL */
			break;
		}
		else if ((out & 0x80) == 0) {
			/* we're good to go */
		}
		else if ((out & 0xe0) == 0xc0) {
			if (i + 1 >= isz) {	/* insufficient source */
				break;
			}
			if ((src[i + 1] & 0xc0) != 0x80) {
				out = UCS2_REPLACEMENT_CHARACTER;
			}
			else {
				out &= 0x1f;
				out <<= 6;
				out |= src[++i] & 0x3f;
			}
		}
		else if ((out & 0xf0) == 0xe0) {
			if (i + 2 >= isz) {	/* insufficient source */
				break;
			}
			if ((src[i + 1] & 0xc0) != 0x80 ||
			    (src[i + 2] & 0xc0) != 0x80) {
				out = UCS2_REPLACEMENT_CHARACTER;
			}
			else {
				out &= 0x0f;
				out <<= 6;
				out |= src[++i] & 0x3f;
				out <<= 6;
				out |= src[++i] & 0x3f;
			}
		}
		else {	/* cannot encode as UCS2 */
			out = UCS2_REPLACEMENT_CHARACTER;
		}
		dst[j++] = htole16(out);
	}
	dst[j] = '\0';

	/* Only look at src[i] if it is still inside the input buffer. */
	if (i < isz && src[i] != '\0')
		warnx("bad UTF8 string");

	if (osz)
		*osz = (j + 1) * sizeof(*dst);
	return dst;
}
255
256
/*
 * Return the number of bytes needed to hold the NUL terminated string
 * 'src' as 16-bit code units, terminator included.
 *
 * One code unit is counted for each well-formed 2 or 3 byte sequence
 * (lead byte followed by the right number of 10xxxxxx bytes) and one
 * for every other byte.  Nothing is allocated or converted.
 */
PUBLIC size_t
utf8_to_ucs2_size(const char *src)
{
	const uint8_t *p = (const uint8_t *)src;
	size_t units = 0;

	while (*p != '\0') {
		size_t step = 1;	/* bytes consumed by this code unit */

		if ((*p & 0xe0) == 0xc0) {
			if ((p[1] & 0xc0) == 0x80)
				step = 2;
		}
		else if ((*p & 0xf0) == 0xe0) {
			/* && stops at a NUL in p[1], so p[2] is in bounds */
			if ((p[1] & 0xc0) == 0x80 && (p[2] & 0xc0) == 0x80)
				step = 3;
		}

		p += step;
		units++;
	}

	return (units + 1) * sizeof(uint16_t);
}
294
295 /************************************************************************
296 * UUID routines
297 */
298 PUBLIC int
299 uuid_scanf(struct uuid *uuid, const char *str)
300 {
301
302 return sscanf(str,
303 "%08x-%04hx-%04hx-%02hhx%02hhx-%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx",
304 &uuid->time_low, &uuid->time_mid, &uuid->time_hi_and_version,
305 &uuid->clock_seq_hi_and_reserved, &uuid->clock_seq_low,
306 &uuid->node[0], &uuid->node[1], &uuid->node[2], &uuid->node[3],
307 &uuid->node[4], &uuid->node[5]);
308 }
309
310 /*
311 * from sys/kern/kern_uuid.c
312 */
313 PUBLIC int
314 uuid_snprintf(char *buf, size_t sz, const struct uuid *uuid)
315 {
316
317 return snprintf(buf, sz,
318 "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
319 uuid->time_low, uuid->time_mid, uuid->time_hi_and_version,
320 uuid->clock_seq_hi_and_reserved, uuid->clock_seq_low,
321 uuid->node[0], uuid->node[1], uuid->node[2], uuid->node[3],
322 uuid->node[4], uuid->node[5]);
323 }
324
325 PUBLIC int
326 uuid_printf(const struct uuid *uuid)
327 {
328 char buf[UUID_STR_LEN];
329
330 (void) uuid_snprintf(buf, sizeof(buf), uuid);
331 printf("%s", buf);
332 return (0);
333 }
334
335 /************************************************************************
336 * Misc routines
337 */
338
339 PUBLIC void
340 show_data(const uint8_t *buf, size_t len, const char *prefix)
341 {
342 uint8_t line_buf[17];
343 size_t i;
344
345 line_buf[16] = '\0';
346 for (i = 0; i < len; i++) {
347 if ((i & 0xf) == 0)
348 printf("%s%08zx: ", prefix, i);
349 line_buf[i & 0xf] = isprint((int)buf[i]) ? buf[i] : '.';
350 printf("%02x ", buf[i]);
351 if ((i & 0xf) == 0xf)
352 printf(" %s\n", line_buf);
353 else if ((i & 0x7) == 0x7)
354 printf(" ");
355 }
356 i &= 0xf;
357 if (i != 0) {
358 line_buf[i] = '\0';
359 if (i < 8)
360 printf(" ");
361 while (i++ < 16)
362 printf(" ");
363
364 printf(" %s\n", line_buf);
365 }
366 }
367
368 PUBLIC uint16_t
369 strtous(const char *str, char **endptr, int base)
370 {
371 uintmax_t val;
372 int rstatus;
373
374 val = strtou(str, endptr, base, 0, USHRT_MAX, &rstatus);
375
376 switch (rstatus) {
377 case EINVAL:
378 /*###378 [lint] warning constant in conditional context [161]%%%*/
379 assert(0);
380 /*###379 [lint] warning 'break' statement not reached [193]%%%*/
381 break;
382 case ENOTSUP:
383 if (endptr != NULL)
384 break;
385 /*FALLTHROUGH*/
386 case ECANCELED:
387 err(EXIT_FAILURE, "invalid numeric string: %s\n", str);
388 case ERANGE:
389 err(EXIT_FAILURE, "value out of range [0,%#x]: %s\n",
390 USHRT_MAX, str);
391 default:
392 break;
393 }
394
395 return (uint16_t)val;
396 }
397
/*
 * Read the entire contents of a file into a newly allocated buffer.
 *
 * fname = path of the file to read, or "-" for standard input
 * size  = receives the number of bytes returned in the buffer
 *
 * Input is read until end-of-file; a short read is not taken to mean
 * that the data is complete, so pipes and terminals are read in full.
 *
 * The data is always followed by a NUL byte.  When at least one byte
 * was read, that terminator is included in *size; for empty input
 * *size is 0.
 *
 * Exits with EXIT_FAILURE if the file cannot be opened or read.
 * Returns the buffer; the caller owns it.
 */
char *
read_file(const char *fname, size_t *size)
{
	char *buf;
	size_t bufsz, cnt;
	ssize_t ssz;
	int fd, is_stdin;

	assert(fname != NULL);
	if (fname == NULL)
		return NULL;

	is_stdin = strcmp(fname, "-") == 0;
	if (is_stdin) {
		/*
		 * Left in blocking mode: non-blocking reads would only
		 * turn waiting for data into a busy loop.
		 */
		fd = STDIN_FILENO;
	}
	else {
		fd = open(fname, O_RDONLY);
		if (fd == -1)
			err(EXIT_FAILURE, "open");
	}

	bufsz = 0x800;
	buf = emalloc(bufsz);
	cnt = 0;
	for (;;) {
		/*
		 * Grow before reading so that there is always at least
		 * one free byte; at EOF that byte takes the terminator.
		 */
		if (cnt == bufsz) {
			bufsz *= 2;
			buf = erealloc(buf, bufsz);
		}

		ssz = read(fd, buf + cnt, bufsz - cnt);
		if (ssz == -1) {
			/*
			 * EAGAIN can still occur if the descriptor was
			 * handed to us in non-blocking mode.
			 */
			if (errno == EAGAIN || errno == EINTR)
				continue;
			err(EXIT_FAILURE, "read");
		}
		if (ssz == 0)	/* end of file */
			break;

		cnt += (size_t)ssz;
	}

	/* XXX: what about UCS-2? */
	buf[cnt] = '\0';
	if (cnt != 0)
		cnt++;

	if (!is_stdin)
		close(fd);

	*size = cnt;
	return buf;
}
471