Home | History | Annotate | Line # | Download | only in dist
preconv.c revision 1.5
      1  1.5  christos /*	Id: preconv.c,v 1.17 2018/12/13 11:55:47 schwarze Exp  */
      2  1.1     joerg /*
      3  1.1     joerg  * Copyright (c) 2011 Kristaps Dzonsons <kristaps (at) bsd.lv>
      4  1.2  christos  * Copyright (c) 2014 Ingo Schwarze <schwarze (at) openbsd.org>
      5  1.1     joerg  *
      6  1.1     joerg  * Permission to use, copy, modify, and distribute this software for any
      7  1.1     joerg  * purpose with or without fee is hereby granted, provided that the above
      8  1.1     joerg  * copyright notice and this permission notice appear in all copies.
      9  1.1     joerg  *
     10  1.1     joerg  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
     11  1.1     joerg  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     12  1.1     joerg  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
     13  1.1     joerg  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     14  1.1     joerg  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     15  1.1     joerg  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
     16  1.1     joerg  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     17  1.1     joerg  */
     18  1.1     joerg #include "config.h"
     19  1.1     joerg 
     20  1.2  christos #include <sys/types.h>
     21  1.1     joerg 
     22  1.1     joerg #include <assert.h>
     23  1.1     joerg #include <stdio.h>
     24  1.1     joerg #include <string.h>
     25  1.5  christos 
     26  1.2  christos #include "mandoc.h"
     27  1.5  christos #include "roff.h"
     28  1.5  christos #include "mandoc_parse.h"
     29  1.2  christos #include "libmandoc.h"
     30  1.1     joerg 
     31  1.2  christos int
     32  1.2  christos preconv_encode(const struct buf *ib, size_t *ii, struct buf *ob, size_t *oi,
     33  1.2  christos     int *filenc)
     34  1.1     joerg {
     35  1.2  christos 	const unsigned char	*cu;
     36  1.4  christos 	int			 nby;
     37  1.4  christos 	unsigned int		 accum;
     38  1.1     joerg 
     39  1.2  christos 	cu = (const unsigned char *)ib->buf + *ii;
     40  1.2  christos 	assert(*cu & 0x80);
     41  1.1     joerg 
     42  1.2  christos 	if ( ! (*filenc & MPARSE_UTF8))
     43  1.2  christos 		goto latin;
     44  1.1     joerg 
     45  1.2  christos 	nby = 1;
     46  1.2  christos 	while (nby < 5 && *cu & (1 << (7 - nby)))
     47  1.2  christos 		nby++;
     48  1.2  christos 
     49  1.2  christos 	switch (nby) {
     50  1.2  christos 	case 2:
     51  1.2  christos 		accum = *cu & 0x1f;
     52  1.2  christos 		if (accum < 0x02)  /* Obfuscated ASCII. */
     53  1.2  christos 			goto latin;
     54  1.2  christos 		break;
     55  1.2  christos 	case 3:
     56  1.2  christos 		accum = *cu & 0x0f;
     57  1.2  christos 		break;
     58  1.2  christos 	case 4:
     59  1.2  christos 		accum = *cu & 0x07;
     60  1.2  christos 		if (accum > 0x04) /* Beyond Unicode. */
     61  1.2  christos 			goto latin;
     62  1.2  christos 		break;
     63  1.2  christos 	default:  /* Bad sequence header. */
     64  1.2  christos 		goto latin;
     65  1.2  christos 	}
     66  1.2  christos 
     67  1.2  christos 	cu++;
     68  1.2  christos 	switch (nby) {
     69  1.2  christos 	case 3:
     70  1.2  christos 		if ((accum == 0x00 && ! (*cu & 0x20)) ||  /* Use 2-byte. */
     71  1.2  christos 		    (accum == 0x0d && *cu & 0x20))  /* Surrogates. */
     72  1.2  christos 			goto latin;
     73  1.2  christos 		break;
     74  1.2  christos 	case 4:
     75  1.2  christos 		if ((accum == 0x00 && ! (*cu & 0x30)) ||  /* Use 3-byte. */
     76  1.2  christos 		    (accum == 0x04 && *cu & 0x30))  /* Beyond Unicode. */
     77  1.2  christos 			goto latin;
     78  1.2  christos 		break;
     79  1.2  christos 	default:
     80  1.2  christos 		break;
     81  1.2  christos 	}
     82  1.2  christos 
     83  1.2  christos 	while (--nby) {
     84  1.2  christos 		if ((*cu & 0xc0) != 0x80)  /* Invalid continuation. */
     85  1.2  christos 			goto latin;
     86  1.2  christos 		accum <<= 6;
     87  1.2  christos 		accum += *cu & 0x3f;
     88  1.2  christos 		cu++;
     89  1.2  christos 	}
     90  1.2  christos 
     91  1.2  christos 	assert(accum > 0x7f);
     92  1.2  christos 	assert(accum < 0x110000);
     93  1.2  christos 	assert(accum < 0xd800 || accum > 0xdfff);
     94  1.2  christos 
     95  1.2  christos 	*oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum);
     96  1.2  christos 	*ii = (const char *)cu - ib->buf;
     97  1.2  christos 	*filenc &= ~MPARSE_LATIN1;
     98  1.3  christos 	return 1;
     99  1.1     joerg 
    100  1.2  christos latin:
    101  1.2  christos 	if ( ! (*filenc & MPARSE_LATIN1))
    102  1.3  christos 		return 0;
    103  1.1     joerg 
    104  1.2  christos 	*oi += snprintf(ob->buf + *oi, 11,
    105  1.2  christos 	    "\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]);
    106  1.1     joerg 
    107  1.2  christos 	*filenc &= ~MPARSE_UTF8;
    108  1.3  christos 	return 1;
    109  1.1     joerg }
    110  1.1     joerg 
    111  1.2  christos int
    112  1.2  christos preconv_cue(const struct buf *b, size_t offset)
    113  1.1     joerg {
    114  1.1     joerg 	const char	*ln, *eoln, *eoph;
    115  1.2  christos 	size_t		 sz, phsz;
    116  1.1     joerg 
    117  1.2  christos 	ln = b->buf + offset;
    118  1.2  christos 	sz = b->sz - offset;
    119  1.1     joerg 
    120  1.1     joerg 	/* Look for the end-of-line. */
    121  1.1     joerg 
    122  1.1     joerg 	if (NULL == (eoln = memchr(ln, '\n', sz)))
    123  1.2  christos 		eoln = ln + sz;
    124  1.1     joerg 
    125  1.1     joerg 	/* Check if we have the correct header/trailer. */
    126  1.1     joerg 
    127  1.2  christos 	if ((sz = (size_t)(eoln - ln)) < 10 ||
    128  1.2  christos 	    memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3))
    129  1.3  christos 		return MPARSE_UTF8 | MPARSE_LATIN1;
    130  1.1     joerg 
    131  1.1     joerg 	/* Move after the header and adjust for the trailer. */
    132  1.1     joerg 
    133  1.1     joerg 	ln += 7;
    134  1.1     joerg 	sz -= 10;
    135  1.1     joerg 
    136  1.1     joerg 	while (sz > 0) {
    137  1.1     joerg 		while (sz > 0 && ' ' == *ln) {
    138  1.1     joerg 			ln++;
    139  1.1     joerg 			sz--;
    140  1.1     joerg 		}
    141  1.1     joerg 		if (0 == sz)
    142  1.1     joerg 			break;
    143  1.1     joerg 
    144  1.1     joerg 		/* Find the end-of-phrase marker (or eoln). */
    145  1.1     joerg 
    146  1.1     joerg 		if (NULL == (eoph = memchr(ln, ';', sz)))
    147  1.1     joerg 			eoph = eoln - 3;
    148  1.1     joerg 		else
    149  1.1     joerg 			eoph++;
    150  1.1     joerg 
    151  1.1     joerg 		/* Only account for the "coding" phrase. */
    152  1.1     joerg 
    153  1.2  christos 		if ((phsz = eoph - ln) < 7 ||
    154  1.2  christos 		    strncasecmp(ln, "coding:", 7)) {
    155  1.1     joerg 			sz -= phsz;
    156  1.1     joerg 			ln += phsz;
    157  1.1     joerg 			continue;
    158  1.2  christos 		}
    159  1.1     joerg 
    160  1.1     joerg 		sz -= 7;
    161  1.1     joerg 		ln += 7;
    162  1.1     joerg 
    163  1.1     joerg 		while (sz > 0 && ' ' == *ln) {
    164  1.1     joerg 			ln++;
    165  1.1     joerg 			sz--;
    166  1.1     joerg 		}
    167  1.1     joerg 		if (0 == sz)
    168  1.3  christos 			return 0;
    169  1.1     joerg 
    170  1.1     joerg 		/* Check us against known encodings. */
    171  1.1     joerg 
    172  1.2  christos 		if (phsz > 4 && !strncasecmp(ln, "utf-8", 5))
    173  1.3  christos 			return MPARSE_UTF8;
    174  1.2  christos 		if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11))
    175  1.3  christos 			return MPARSE_LATIN1;
    176  1.3  christos 		return 0;
    177  1.1     joerg 	}
    178  1.3  christos 	return MPARSE_UTF8 | MPARSE_LATIN1;
    179  1.1     joerg }
    180