1 1.5 christos /* Id: preconv.c,v 1.17 2018/12/13 11:55:47 schwarze Exp */ 2 1.1 joerg /* 3 1.1 joerg * Copyright (c) 2011 Kristaps Dzonsons <kristaps (at) bsd.lv> 4 1.2 christos * Copyright (c) 2014 Ingo Schwarze <schwarze (at) openbsd.org> 5 1.1 joerg * 6 1.1 joerg * Permission to use, copy, modify, and distribute this software for any 7 1.1 joerg * purpose with or without fee is hereby granted, provided that the above 8 1.1 joerg * copyright notice and this permission notice appear in all copies. 9 1.1 joerg * 10 1.1 joerg * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 1.1 joerg * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 1.1 joerg * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 1.1 joerg * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 1.1 joerg * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 1.1 joerg * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 1.1 joerg * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 1.1 joerg */ 18 1.1 joerg #include "config.h" 19 1.1 joerg 20 1.2 christos #include <sys/types.h> 21 1.1 joerg 22 1.1 joerg #include <assert.h> 23 1.1 joerg #include <stdio.h> 24 1.1 joerg #include <string.h> 25 1.5 christos 26 1.2 christos #include "mandoc.h" 27 1.5 christos #include "roff.h" 28 1.5 christos #include "mandoc_parse.h" 29 1.2 christos #include "libmandoc.h" 30 1.1 joerg 31 1.2 christos int 32 1.2 christos preconv_encode(const struct buf *ib, size_t *ii, struct buf *ob, size_t *oi, 33 1.2 christos int *filenc) 34 1.1 joerg { 35 1.2 christos const unsigned char *cu; 36 1.4 christos int nby; 37 1.4 christos unsigned int accum; 38 1.1 joerg 39 1.2 christos cu = (const unsigned char *)ib->buf + *ii; 40 1.2 christos assert(*cu & 0x80); 41 1.1 joerg 42 1.2 christos if ( ! (*filenc & MPARSE_UTF8)) 43 1.2 christos goto latin; 44 1.1 joerg 45 1.2 christos nby = 1; 46 1.2 christos while (nby < 5 && *cu & (1 << (7 - nby))) 47 1.2 christos nby++; 48 1.2 christos 49 1.2 christos switch (nby) { 50 1.2 christos case 2: 51 1.2 christos accum = *cu & 0x1f; 52 1.2 christos if (accum < 0x02) /* Obfuscated ASCII. */ 53 1.2 christos goto latin; 54 1.2 christos break; 55 1.2 christos case 3: 56 1.2 christos accum = *cu & 0x0f; 57 1.2 christos break; 58 1.2 christos case 4: 59 1.2 christos accum = *cu & 0x07; 60 1.2 christos if (accum > 0x04) /* Beyond Unicode. */ 61 1.2 christos goto latin; 62 1.2 christos break; 63 1.2 christos default: /* Bad sequence header. */ 64 1.2 christos goto latin; 65 1.2 christos } 66 1.2 christos 67 1.2 christos cu++; 68 1.2 christos switch (nby) { 69 1.2 christos case 3: 70 1.2 christos if ((accum == 0x00 && ! (*cu & 0x20)) || /* Use 2-byte. */ 71 1.2 christos (accum == 0x0d && *cu & 0x20)) /* Surrogates. */ 72 1.2 christos goto latin; 73 1.2 christos break; 74 1.2 christos case 4: 75 1.2 christos if ((accum == 0x00 && ! (*cu & 0x30)) || /* Use 3-byte. */ 76 1.2 christos (accum == 0x04 && *cu & 0x30)) /* Beyond Unicode. */ 77 1.2 christos goto latin; 78 1.2 christos break; 79 1.2 christos default: 80 1.2 christos break; 81 1.2 christos } 82 1.2 christos 83 1.2 christos while (--nby) { 84 1.2 christos if ((*cu & 0xc0) != 0x80) /* Invalid continuation. */ 85 1.2 christos goto latin; 86 1.2 christos accum <<= 6; 87 1.2 christos accum += *cu & 0x3f; 88 1.2 christos cu++; 89 1.2 christos } 90 1.2 christos 91 1.2 christos assert(accum > 0x7f); 92 1.2 christos assert(accum < 0x110000); 93 1.2 christos assert(accum < 0xd800 || accum > 0xdfff); 94 1.2 christos 95 1.2 christos *oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum); 96 1.2 christos *ii = (const char *)cu - ib->buf; 97 1.2 christos *filenc &= ~MPARSE_LATIN1; 98 1.3 christos return 1; 99 1.1 joerg 100 1.2 christos latin: 101 1.2 christos if ( ! (*filenc & MPARSE_LATIN1)) 102 1.3 christos return 0; 103 1.1 joerg 104 1.2 christos *oi += snprintf(ob->buf + *oi, 11, 105 1.2 christos "\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]); 106 1.1 joerg 107 1.2 christos *filenc &= ~MPARSE_UTF8; 108 1.3 christos return 1; 109 1.1 joerg } 110 1.1 joerg 111 1.2 christos int 112 1.2 christos preconv_cue(const struct buf *b, size_t offset) 113 1.1 joerg { 114 1.1 joerg const char *ln, *eoln, *eoph; 115 1.2 christos size_t sz, phsz; 116 1.1 joerg 117 1.2 christos ln = b->buf + offset; 118 1.2 christos sz = b->sz - offset; 119 1.1 joerg 120 1.1 joerg /* Look for the end-of-line. */ 121 1.1 joerg 122 1.1 joerg if (NULL == (eoln = memchr(ln, '\n', sz))) 123 1.2 christos eoln = ln + sz; 124 1.1 joerg 125 1.1 joerg /* Check if we have the correct header/trailer. */ 126 1.1 joerg 127 1.2 christos if ((sz = (size_t)(eoln - ln)) < 10 || 128 1.2 christos memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3)) 129 1.3 christos return MPARSE_UTF8 | MPARSE_LATIN1; 130 1.1 joerg 131 1.1 joerg /* Move after the header and adjust for the trailer. */ 132 1.1 joerg 133 1.1 joerg ln += 7; 134 1.1 joerg sz -= 10; 135 1.1 joerg 136 1.1 joerg while (sz > 0) { 137 1.1 joerg while (sz > 0 && ' ' == *ln) { 138 1.1 joerg ln++; 139 1.1 joerg sz--; 140 1.1 joerg } 141 1.1 joerg if (0 == sz) 142 1.1 joerg break; 143 1.1 joerg 144 1.1 joerg /* Find the end-of-phrase marker (or eoln). */ 145 1.1 joerg 146 1.1 joerg if (NULL == (eoph = memchr(ln, ';', sz))) 147 1.1 joerg eoph = eoln - 3; 148 1.1 joerg else 149 1.1 joerg eoph++; 150 1.1 joerg 151 1.1 joerg /* Only account for the "coding" phrase. */ 152 1.1 joerg 153 1.2 christos if ((phsz = eoph - ln) < 7 || 154 1.2 christos strncasecmp(ln, "coding:", 7)) { 155 1.1 joerg sz -= phsz; 156 1.1 joerg ln += phsz; 157 1.1 joerg continue; 158 1.2 christos } 159 1.1 joerg 160 1.1 joerg sz -= 7; 161 1.1 joerg ln += 7; 162 1.1 joerg 163 1.1 joerg while (sz > 0 && ' ' == *ln) { 164 1.1 joerg ln++; 165 1.1 joerg sz--; 166 1.1 joerg } 167 1.1 joerg if (0 == sz) 168 1.3 christos return 0; 169 1.1 joerg 170 1.1 joerg /* Check us against known encodings. */ 171 1.1 joerg 172 1.2 christos if (phsz > 4 && !strncasecmp(ln, "utf-8", 5)) 173 1.3 christos return MPARSE_UTF8; 174 1.2 christos if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11)) 175 1.3 christos return MPARSE_LATIN1; 176 1.3 christos return 0; 177 1.1 joerg } 178 1.3 christos return MPARSE_UTF8 | MPARSE_LATIN1; 179 1.1 joerg } 180