preconv.c revision 1.1 1 1.1 joerg /* $Vendor-Id: preconv.c,v 1.5 2011/07/24 18:15:14 kristaps Exp $ */
2 1.1 joerg /*
3 1.1 joerg * Copyright (c) 2011 Kristaps Dzonsons <kristaps (at) bsd.lv>
4 1.1 joerg *
5 1.1 joerg * Permission to use, copy, modify, and distribute this software for any
6 1.1 joerg * purpose with or without fee is hereby granted, provided that the above
7 1.1 joerg * copyright notice and this permission notice appear in all copies.
8 1.1 joerg *
9 1.1 joerg * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 1.1 joerg * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 1.1 joerg * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 1.1 joerg * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 1.1 joerg * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 1.1 joerg * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 1.1 joerg * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 1.1 joerg */
17 1.1 joerg #ifdef HAVE_CONFIG_H
18 1.1 joerg #include "config.h"
19 1.1 joerg #endif
20 1.1 joerg
21 1.1 joerg #ifdef HAVE_MMAP
22 1.1 joerg #include <sys/stat.h>
23 1.1 joerg #include <sys/mman.h>
24 1.1 joerg #endif
25 1.1 joerg
26 1.1 joerg #include <assert.h>
27 1.1 joerg #include <fcntl.h>
28 1.1 joerg #include <stdio.h>
29 1.1 joerg #include <stdlib.h>
30 1.1 joerg #include <string.h>
31 1.1 joerg #include <unistd.h>
32 1.1 joerg
33 1.1 joerg /*
34 1.1 joerg * The read_whole_file() and resize_buf() functions are copied from
35 1.1 joerg * read.c, including all dependency code (MAP_FILE, etc.).
36 1.1 joerg */
37 1.1 joerg
38 1.1 joerg #ifndef MAP_FILE
39 1.1 joerg #define MAP_FILE 0
40 1.1 joerg #endif
41 1.1 joerg
/*
 * Input encodings this utility can convert.  The values are used as
 * indices into the encs[] table, and ENC__MAX doubles as the
 * "no encoding chosen" marker.
 */
enum	enc {
	ENC_UTF_8 = 0,	/* UTF-8 */
	ENC_US_ASCII,	/* US-ASCII */
	ENC_LATIN_1,	/* Latin-1 */
	ENC__MAX	/* number of encodings above */
};
48 1.1 joerg
/*
 * The whole input held in memory, together with the position at
 * which conversion is to begin.
 */
struct	buf {
	char		*buf;	/* raw input bytes */
	size_t		 sz;	/* number of bytes in buf */
	size_t		 offs;	/* offset of the first byte to process */
};
54 1.1 joerg
/* One row of the encoding table: a name and the matching converter. */
struct	encode {
	const char	 *name;	/* name, compared case-insensitively */
	int		(*conv)(const struct buf *); /* 0 means bad input */
};
59 1.1 joerg
/* Encoding detection from an in-file "-*-" cue line. */
static	int	 cue_enc(const struct buf *b, size_t *offs, enum enc *enc);

/* Per-encoding converters; each one writes its result to stdout. */
static	int	 conv_latin_1(const struct buf *b);
static	int	 conv_us_ascii(const struct buf *b);
static	int	 conv_utf_8(const struct buf *b);

/* Input reading (copied from read.c) and usage reporting. */
static	int	 read_whole_file(const char *f, int fd,
			struct buf *fb, int *with_mmap);
static	void	 resize_buf(struct buf *buf, size_t initial);
static	void	 usage(void);
68 1.1 joerg
69 1.1 joerg static const struct encode encs[ENC__MAX] = {
70 1.1 joerg { "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
71 1.1 joerg { "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
72 1.1 joerg { "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
73 1.1 joerg };
74 1.1 joerg
75 1.1 joerg static const char *progname;
76 1.1 joerg
77 1.1 joerg static void
78 1.1 joerg usage(void)
79 1.1 joerg {
80 1.1 joerg
81 1.1 joerg fprintf(stderr, "usage: %s "
82 1.1 joerg "[-D enc] "
83 1.1 joerg "[-e ENC] "
84 1.1 joerg "[file]\n", progname);
85 1.1 joerg }
86 1.1 joerg
87 1.1 joerg static int
88 1.1 joerg conv_latin_1(const struct buf *b)
89 1.1 joerg {
90 1.1 joerg size_t i;
91 1.1 joerg unsigned char cu;
92 1.1 joerg const char *cp;
93 1.1 joerg
94 1.1 joerg cp = b->buf + (int)b->offs;
95 1.1 joerg
96 1.1 joerg /*
97 1.1 joerg * Latin-1 falls into the first 256 code-points of Unicode, so
98 1.1 joerg * there's no need for any sort of translation. Just make the
99 1.1 joerg * 8-bit characters use the Unicode escape.
100 1.1 joerg * Note that binary values 128 < v < 160 are passed through
101 1.1 joerg * unmodified to mandoc.
102 1.1 joerg */
103 1.1 joerg
104 1.1 joerg for (i = b->offs; i < b->sz; i++) {
105 1.1 joerg cu = (unsigned char)*cp++;
106 1.1 joerg cu < 160U ? putchar(cu) : printf("\\[u%.4X]", cu);
107 1.1 joerg }
108 1.1 joerg
109 1.1 joerg return(1);
110 1.1 joerg }
111 1.1 joerg
112 1.1 joerg static int
113 1.1 joerg conv_us_ascii(const struct buf *b)
114 1.1 joerg {
115 1.1 joerg
116 1.1 joerg /*
117 1.1 joerg * US-ASCII has no conversion since it falls into the first 128
118 1.1 joerg * bytes of Unicode.
119 1.1 joerg */
120 1.1 joerg
121 1.1 joerg fwrite(b->buf, 1, b->sz, stdout);
122 1.1 joerg return(1);
123 1.1 joerg }
124 1.1 joerg
125 1.1 joerg static int
126 1.1 joerg conv_utf_8(const struct buf *b)
127 1.1 joerg {
128 1.1 joerg int state, be;
129 1.1 joerg unsigned int accum;
130 1.1 joerg size_t i;
131 1.1 joerg unsigned char cu;
132 1.1 joerg const char *cp;
133 1.1 joerg const long one = 1L;
134 1.1 joerg
135 1.1 joerg cp = b->buf + (int)b->offs;
136 1.1 joerg state = 0;
137 1.1 joerg accum = 0U;
138 1.1 joerg be = 0;
139 1.1 joerg
140 1.1 joerg /* Quick test for big-endian value. */
141 1.1 joerg
142 1.1 joerg if ( ! (*((const char *)(&one))))
143 1.1 joerg be = 1;
144 1.1 joerg
145 1.1 joerg for (i = b->offs; i < b->sz; i++) {
146 1.1 joerg cu = (unsigned char)*cp++;
147 1.1 joerg if (state) {
148 1.1 joerg if ( ! (cu & 128) || (cu & 64)) {
149 1.1 joerg /* Bad sequence header. */
150 1.1 joerg return(0);
151 1.1 joerg }
152 1.1 joerg
153 1.1 joerg /* Accept only legitimate bit patterns. */
154 1.1 joerg
155 1.1 joerg if (cu > 191 || cu < 128) {
156 1.1 joerg /* Bad in-sequence bits. */
157 1.1 joerg return(0);
158 1.1 joerg }
159 1.1 joerg
160 1.1 joerg accum |= (cu & 63) << --state * 6;
161 1.1 joerg
162 1.1 joerg /*
163 1.1 joerg * Accum is held in little-endian order as
164 1.1 joerg * stipulated by the UTF-8 sequence coding. We
165 1.1 joerg * need to convert to a native big-endian if our
166 1.1 joerg * architecture requires it.
167 1.1 joerg */
168 1.1 joerg
169 1.1 joerg if (0 == state && be)
170 1.1 joerg accum = (accum >> 24) |
171 1.1 joerg ((accum << 8) & 0x00FF0000) |
172 1.1 joerg ((accum >> 8) & 0x0000FF00) |
173 1.1 joerg (accum << 24);
174 1.1 joerg
175 1.1 joerg if (0 == state) {
176 1.1 joerg accum < 128U ? putchar(accum) :
177 1.1 joerg printf("\\[u%.4X]", accum);
178 1.1 joerg accum = 0U;
179 1.1 joerg }
180 1.1 joerg } else if (cu & (1 << 7)) {
181 1.1 joerg /*
182 1.1 joerg * Entering a UTF-8 state: if we encounter a
183 1.1 joerg * UTF-8 bitmask, calculate the expected UTF-8
184 1.1 joerg * state from it.
185 1.1 joerg */
186 1.1 joerg for (state = 0; state < 7; state++)
187 1.1 joerg if ( ! (cu & (1 << (7 - state))))
188 1.1 joerg break;
189 1.1 joerg
190 1.1 joerg /* Accept only legitimate bit patterns. */
191 1.1 joerg
192 1.1 joerg switch (state) {
193 1.1 joerg case (4):
194 1.1 joerg if (cu <= 244 && cu >= 240) {
195 1.1 joerg accum = (cu & 7) << 18;
196 1.1 joerg break;
197 1.1 joerg }
198 1.1 joerg /* Bad 4-sequence start bits. */
199 1.1 joerg return(0);
200 1.1 joerg case (3):
201 1.1 joerg if (cu <= 239 && cu >= 224) {
202 1.1 joerg accum = (cu & 15) << 12;
203 1.1 joerg break;
204 1.1 joerg }
205 1.1 joerg /* Bad 3-sequence start bits. */
206 1.1 joerg return(0);
207 1.1 joerg case (2):
208 1.1 joerg if (cu <= 223 && cu >= 194) {
209 1.1 joerg accum = (cu & 31) << 6;
210 1.1 joerg break;
211 1.1 joerg }
212 1.1 joerg /* Bad 2-sequence start bits. */
213 1.1 joerg return(0);
214 1.1 joerg default:
215 1.1 joerg /* Bad sequence bit mask. */
216 1.1 joerg return(0);
217 1.1 joerg }
218 1.1 joerg state--;
219 1.1 joerg } else
220 1.1 joerg putchar(cu);
221 1.1 joerg }
222 1.1 joerg
223 1.1 joerg if (0 != state) {
224 1.1 joerg /* Bad trailing bits. */
225 1.1 joerg return(0);
226 1.1 joerg }
227 1.1 joerg
228 1.1 joerg return(1);
229 1.1 joerg }
230 1.1 joerg
231 1.1 joerg static void
232 1.1 joerg resize_buf(struct buf *buf, size_t initial)
233 1.1 joerg {
234 1.1 joerg
235 1.1 joerg buf->sz = buf->sz > initial / 2 ?
236 1.1 joerg 2 * buf->sz : initial;
237 1.1 joerg
238 1.1 joerg buf->buf = realloc(buf->buf, buf->sz);
239 1.1 joerg if (NULL == buf->buf) {
240 1.1 joerg perror(NULL);
241 1.1 joerg exit(EXIT_FAILURE);
242 1.1 joerg }
243 1.1 joerg }
244 1.1 joerg
245 1.1 joerg static int
246 1.1 joerg read_whole_file(const char *f, int fd,
247 1.1 joerg struct buf *fb, int *with_mmap)
248 1.1 joerg {
249 1.1 joerg size_t off;
250 1.1 joerg ssize_t ssz;
251 1.1 joerg
252 1.1 joerg #ifdef HAVE_MMAP
253 1.1 joerg struct stat st;
254 1.1 joerg if (-1 == fstat(fd, &st)) {
255 1.1 joerg perror(f);
256 1.1 joerg return(0);
257 1.1 joerg }
258 1.1 joerg
259 1.1 joerg /*
260 1.1 joerg * If we're a regular file, try just reading in the whole entry
261 1.1 joerg * via mmap(). This is faster than reading it into blocks, and
262 1.1 joerg * since each file is only a few bytes to begin with, I'm not
263 1.1 joerg * concerned that this is going to tank any machines.
264 1.1 joerg */
265 1.1 joerg
266 1.1 joerg if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
267 1.1 joerg fprintf(stderr, "%s: input too large\n", f);
268 1.1 joerg return(0);
269 1.1 joerg }
270 1.1 joerg
271 1.1 joerg if (S_ISREG(st.st_mode)) {
272 1.1 joerg *with_mmap = 1;
273 1.1 joerg fb->sz = (size_t)st.st_size;
274 1.1 joerg fb->buf = mmap(NULL, fb->sz, PROT_READ,
275 1.1 joerg MAP_FILE|MAP_SHARED, fd, 0);
276 1.1 joerg if (fb->buf != MAP_FAILED)
277 1.1 joerg return(1);
278 1.1 joerg }
279 1.1 joerg #endif
280 1.1 joerg
281 1.1 joerg /*
282 1.1 joerg * If this isn't a regular file (like, say, stdin), then we must
283 1.1 joerg * go the old way and just read things in bit by bit.
284 1.1 joerg */
285 1.1 joerg
286 1.1 joerg *with_mmap = 0;
287 1.1 joerg off = 0;
288 1.1 joerg fb->sz = 0;
289 1.1 joerg fb->buf = NULL;
290 1.1 joerg for (;;) {
291 1.1 joerg if (off == fb->sz && fb->sz == (1U << 31)) {
292 1.1 joerg fprintf(stderr, "%s: input too large\n", f);
293 1.1 joerg break;
294 1.1 joerg }
295 1.1 joerg
296 1.1 joerg if (off == fb->sz)
297 1.1 joerg resize_buf(fb, 65536);
298 1.1 joerg
299 1.1 joerg ssz = read(fd, fb->buf + (int)off, fb->sz - off);
300 1.1 joerg if (ssz == 0) {
301 1.1 joerg fb->sz = off;
302 1.1 joerg return(1);
303 1.1 joerg }
304 1.1 joerg if (ssz == -1) {
305 1.1 joerg perror(f);
306 1.1 joerg break;
307 1.1 joerg }
308 1.1 joerg off += (size_t)ssz;
309 1.1 joerg }
310 1.1 joerg
311 1.1 joerg free(fb->buf);
312 1.1 joerg fb->buf = NULL;
313 1.1 joerg return(0);
314 1.1 joerg }
315 1.1 joerg
316 1.1 joerg static int
317 1.1 joerg cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
318 1.1 joerg {
319 1.1 joerg const char *ln, *eoln, *eoph;
320 1.1 joerg size_t sz, phsz, nsz;
321 1.1 joerg int i;
322 1.1 joerg
323 1.1 joerg ln = b->buf + (int)*offs;
324 1.1 joerg sz = b->sz - *offs;
325 1.1 joerg
326 1.1 joerg /* Look for the end-of-line. */
327 1.1 joerg
328 1.1 joerg if (NULL == (eoln = memchr(ln, '\n', sz)))
329 1.1 joerg return(-1);
330 1.1 joerg
331 1.1 joerg /* Set next-line marker. */
332 1.1 joerg
333 1.1 joerg *offs = (size_t)((eoln + 1) - b->buf);
334 1.1 joerg
335 1.1 joerg /* Check if we have the correct header/trailer. */
336 1.1 joerg
337 1.1 joerg if ((sz = (size_t)(eoln - ln)) < 10 ||
338 1.1 joerg memcmp(ln, ".\\\" -*-", 7) ||
339 1.1 joerg memcmp(eoln - 3, "-*-", 3))
340 1.1 joerg return(0);
341 1.1 joerg
342 1.1 joerg /* Move after the header and adjust for the trailer. */
343 1.1 joerg
344 1.1 joerg ln += 7;
345 1.1 joerg sz -= 10;
346 1.1 joerg
347 1.1 joerg while (sz > 0) {
348 1.1 joerg while (sz > 0 && ' ' == *ln) {
349 1.1 joerg ln++;
350 1.1 joerg sz--;
351 1.1 joerg }
352 1.1 joerg if (0 == sz)
353 1.1 joerg break;
354 1.1 joerg
355 1.1 joerg /* Find the end-of-phrase marker (or eoln). */
356 1.1 joerg
357 1.1 joerg if (NULL == (eoph = memchr(ln, ';', sz)))
358 1.1 joerg eoph = eoln - 3;
359 1.1 joerg else
360 1.1 joerg eoph++;
361 1.1 joerg
362 1.1 joerg /* Only account for the "coding" phrase. */
363 1.1 joerg
364 1.1 joerg if ((phsz = (size_t)(eoph - ln)) < 7 ||
365 1.1 joerg strncasecmp(ln, "coding:", 7)) {
366 1.1 joerg sz -= phsz;
367 1.1 joerg ln += phsz;
368 1.1 joerg continue;
369 1.1 joerg }
370 1.1 joerg
371 1.1 joerg sz -= 7;
372 1.1 joerg ln += 7;
373 1.1 joerg
374 1.1 joerg while (sz > 0 && ' ' == *ln) {
375 1.1 joerg ln++;
376 1.1 joerg sz--;
377 1.1 joerg }
378 1.1 joerg if (0 == sz)
379 1.1 joerg break;
380 1.1 joerg
381 1.1 joerg /* Check us against known encodings. */
382 1.1 joerg
383 1.1 joerg for (i = 0; i < (int)ENC__MAX; i++) {
384 1.1 joerg nsz = strlen(encs[i].name);
385 1.1 joerg if (phsz < nsz)
386 1.1 joerg continue;
387 1.1 joerg if (strncasecmp(ln, encs[i].name, nsz))
388 1.1 joerg continue;
389 1.1 joerg
390 1.1 joerg *enc = (enum enc)i;
391 1.1 joerg return(1);
392 1.1 joerg }
393 1.1 joerg
394 1.1 joerg /* Unknown encoding. */
395 1.1 joerg
396 1.1 joerg *enc = ENC__MAX;
397 1.1 joerg return(1);
398 1.1 joerg }
399 1.1 joerg
400 1.1 joerg return(0);
401 1.1 joerg }
402 1.1 joerg
403 1.1 joerg int
404 1.1 joerg main(int argc, char *argv[])
405 1.1 joerg {
406 1.1 joerg int i, ch, map, fd, rc;
407 1.1 joerg struct buf b;
408 1.1 joerg const char *fn;
409 1.1 joerg enum enc enc, def;
410 1.1 joerg unsigned char bom[3] = { 0xEF, 0xBB, 0xBF };
411 1.1 joerg size_t offs;
412 1.1 joerg extern int optind;
413 1.1 joerg extern char *optarg;
414 1.1 joerg
415 1.1 joerg progname = strrchr(argv[0], '/');
416 1.1 joerg if (progname == NULL)
417 1.1 joerg progname = argv[0];
418 1.1 joerg else
419 1.1 joerg ++progname;
420 1.1 joerg
421 1.1 joerg fn = "<stdin>";
422 1.1 joerg fd = STDIN_FILENO;
423 1.1 joerg rc = EXIT_FAILURE;
424 1.1 joerg enc = def = ENC__MAX;
425 1.1 joerg map = 0;
426 1.1 joerg
427 1.1 joerg memset(&b, 0, sizeof(struct buf));
428 1.1 joerg
429 1.1 joerg while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
430 1.1 joerg switch (ch) {
431 1.1 joerg case ('D'):
432 1.1 joerg /* FALLTHROUGH */
433 1.1 joerg case ('e'):
434 1.1 joerg for (i = 0; i < (int)ENC__MAX; i++) {
435 1.1 joerg if (strcasecmp(optarg, encs[i].name))
436 1.1 joerg continue;
437 1.1 joerg break;
438 1.1 joerg }
439 1.1 joerg if (i < (int)ENC__MAX) {
440 1.1 joerg if ('D' == ch)
441 1.1 joerg def = (enum enc)i;
442 1.1 joerg else
443 1.1 joerg enc = (enum enc)i;
444 1.1 joerg break;
445 1.1 joerg }
446 1.1 joerg
447 1.1 joerg fprintf(stderr, "%s: Bad encoding\n", optarg);
448 1.1 joerg return(EXIT_FAILURE);
449 1.1 joerg case ('r'):
450 1.1 joerg /* FALLTHROUGH */
451 1.1 joerg case ('d'):
452 1.1 joerg /* FALLTHROUGH */
453 1.1 joerg case ('v'):
454 1.1 joerg /* Compatibility with GNU preconv. */
455 1.1 joerg break;
456 1.1 joerg case ('h'):
457 1.1 joerg /* Compatibility with GNU preconv. */
458 1.1 joerg /* FALLTHROUGH */
459 1.1 joerg default:
460 1.1 joerg usage();
461 1.1 joerg return(EXIT_FAILURE);
462 1.1 joerg }
463 1.1 joerg
464 1.1 joerg argc -= optind;
465 1.1 joerg argv += optind;
466 1.1 joerg
467 1.1 joerg /*
468 1.1 joerg * Open and read the first argument on the command-line.
469 1.1 joerg * If we don't have one, we default to stdin.
470 1.1 joerg */
471 1.1 joerg
472 1.1 joerg if (argc > 0) {
473 1.1 joerg fn = *argv;
474 1.1 joerg fd = open(fn, O_RDONLY, 0);
475 1.1 joerg if (-1 == fd) {
476 1.1 joerg perror(fn);
477 1.1 joerg return(EXIT_FAILURE);
478 1.1 joerg }
479 1.1 joerg }
480 1.1 joerg
481 1.1 joerg if ( ! read_whole_file(fn, fd, &b, &map))
482 1.1 joerg goto out;
483 1.1 joerg
484 1.1 joerg /* Try to read the UTF-8 BOM. */
485 1.1 joerg
486 1.1 joerg if (ENC__MAX == enc)
487 1.1 joerg if (b.sz > 3 && 0 == memcmp(b.buf, bom, 3)) {
488 1.1 joerg b.offs = 3;
489 1.1 joerg enc = ENC_UTF_8;
490 1.1 joerg }
491 1.1 joerg
492 1.1 joerg /* Try reading from the "-*-" cue. */
493 1.1 joerg
494 1.1 joerg if (ENC__MAX == enc) {
495 1.1 joerg offs = b.offs;
496 1.1 joerg ch = cue_enc(&b, &offs, &enc);
497 1.1 joerg if (0 == ch)
498 1.1 joerg ch = cue_enc(&b, &offs, &enc);
499 1.1 joerg }
500 1.1 joerg
501 1.1 joerg /*
502 1.1 joerg * No encoding has been detected.
503 1.1 joerg * Thus, we either fall into our default encoder, if specified,
504 1.1 joerg * or use Latin-1 if all else fails.
505 1.1 joerg */
506 1.1 joerg
507 1.1 joerg if (ENC__MAX == enc)
508 1.1 joerg enc = ENC__MAX == def ? ENC_LATIN_1 : def;
509 1.1 joerg
510 1.1 joerg if ( ! (*encs[(int)enc].conv)(&b)) {
511 1.1 joerg fprintf(stderr, "%s: Bad encoding\n", fn);
512 1.1 joerg goto out;
513 1.1 joerg }
514 1.1 joerg
515 1.1 joerg rc = EXIT_SUCCESS;
516 1.1 joerg out:
517 1.1 joerg #ifdef HAVE_MMAP
518 1.1 joerg if (map)
519 1.1 joerg munmap(b.buf, b.sz);
520 1.1 joerg else
521 1.1 joerg #endif
522 1.1 joerg free(b.buf);
523 1.1 joerg
524 1.1 joerg if (fd > STDIN_FILENO)
525 1.1 joerg close(fd);
526 1.1 joerg
527 1.1 joerg return(rc);
528 1.1 joerg }
529