Home | History | Annotate | Line # | Download | only in global
      1 /*	$NetBSD: tok822_parse.c,v 1.1.1.1 2009/06/23 10:08:48 tron Exp $	*/
      2 
      3 /*++
      4 /* NAME
      5 /*	tok822_parse 3
      6 /* SUMMARY
      7 /*	RFC 822 address parser
      8 /* SYNOPSIS
      9 /*	#include <tok822.h>
     10 /*
     11 /*	TOK822 *tok822_scan_limit(str, tailp, limit)
     12 /*	const char *str;
     13 /*	TOK822	**tailp;
     14 /*	int	limit;
     15 /*
     16 /*	TOK822 *tok822_scan(str, tailp)
     17 /*	const char *str;
     18 /*	TOK822	**tailp;
     19 /*
     20 /*	TOK822	*tok822_parse_limit(str, limit)
     21 /*	const char *str;
     22 /*	int	limit;
     23 /*
     24 /*	TOK822	*tok822_parse(str)
     25 /*	const char *str;
     26 /*
     27 /*	TOK822	*tok822_scan_addr(str)
     28 /*	const char *str;
     29 /*
     30 /*	VSTRING	*tok822_externalize(buffer, tree, flags)
     31 /*	VSTRING	*buffer;
     32 /*	TOK822	*tree;
     33 /*	int	flags;
     34 /*
     35 /*	VSTRING	*tok822_internalize(buffer, tree, flags)
     36 /*	VSTRING	*buffer;
     37 /*	TOK822	*tree;
     38 /*	int	flags;
     39 /* DESCRIPTION
     40 /*	This module converts address lists between string form and parse
     41 /*	tree formats. The string form can appear in two different ways:
     42 /*	external (or quoted) form, as used in message headers, and internal
     43 /*	(unquoted) form, as used internally by the mail software.
     44 /*	Although RFC 822 expects 7-bit data, these routines pay no
     45 /*	special attention to 8-bit characters.
     46 /*
     47 /*	tok822_scan() converts the external-form string in \fIstr\fR
     48 /*	to a linear token list. The \fItailp\fR argument is a null pointer
     49 /*	or receives the pointer value of the last result list element.
     50 /*
     51 /*	tok822_scan_limit() implements tok822_scan(), which is a macro.
     52 /*	The \fIlimit\fR argument is either zero or an upper bound on the
     53 /*	number of tokens produced.
     54 /*
     55 /*	tok822_parse() converts the external-form address list in
     56 /*	\fIstr\fR to the corresponding token tree. The parser is permissive
     57 /*	and will not throw away information that it does not understand.
     58 /*	The parser adds missing commas between addresses.
     59 /*
     60 /*	tok822_parse_limit() implements tok822_parse(), which is a macro.
     61 /*	The \fIlimit\fR argument is either zero or an upper bound on the
     62 /*	number of tokens produced.
     63 /*
     64 /*	tok822_scan_addr() converts the external-form string in
     65 /*	\fIstr\fR to an address token tree. This is just string to
     66 /*	token list conversion; no parsing is done. This routine is
     67 /*	suitable for data that should contain just one address and no
     68 /*	other information.
     69 /*
     70 /*	tok822_externalize() converts a token list to external form.
     71 /*	Where appropriate, characters and strings are quoted and white
     72 /*	space is inserted. The \fIflags\fR argument is the binary OR of
     73 /*	zero or more of the following:
     74 /* .IP TOK822_STR_WIPE
     75 /*	Initially, truncate the result to zero length.
     76 /* .IP TOK822_STR_TERM
     77 /*	Append a null terminator to the result when done.
     78 /* .IP TOK822_STR_LINE
     79 /*	Append a line break after each comma token, instead of appending
     80 /*	whitespace.  It is up to the caller to concatenate short lines to
     81 /*	produce longer ones.
     82 /* .IP TOK822_STR_TRNC
     83 /*	Truncate non-address information to 250 characters per address, to
     84 /*	protect Sendmail systems that are vulnerable to the problem in CERT
     85 /*	advisory CA-2003-07.
     86 /*	This flag has effect with tok822_externalize() only.
     87 /* .PP
     88 /*	The macro TOK822_STR_NONE expresses that none of the above features
     89 /*	should be activated.
     90 /*
     91 /*	The macro TOK822_STR_DEFL combines the TOK822_STR_WIPE and
     92 /*	TOK822_STR_TERM flags. This is useful for most token to string
     93 /*	conversions.
     94 /*
     95 /*	The macro TOK822_STR_HEAD combines the TOK822_STR_TERM,
     96 /*	TOK822_STR_LINE and TOK822_STR_TRNC flags. This is useful for
     97 /*	the special case of token to mail header conversion.
     98 /*
     99 /*	tok822_internalize() converts a token list to string form,
    100 /*	without quoting. White space is inserted where appropriate.
    101 /*	The \fIflags\fR argument is as with tok822_externalize().
    102 /* STANDARDS
    103 /* .ad
    104 /* .fi
    105 /*	RFC 822 (ARPA Internet Text Messages). In addition to this standard
    106 /*	this module implements additional operators such as % and !. These
    107 /*	are needed because the real world is not all RFC 822. Also, the ':'
    108 /*	operator is allowed to appear inside addresses, to accommodate DECnet.
    109 /*	In addition, 8-bit data is not given special treatment.
    110 /* LICENSE
    111 /* .ad
    112 /* .fi
    113 /*	The Secure Mailer license must be distributed with this software.
    114 /* AUTHOR(S)
    115 /*	Wietse Venema
    116 /*	IBM T.J. Watson Research
    117 /*	P.O. Box 704
    118 /*	Yorktown Heights, NY 10598, USA
    119 /*--*/
    120 
    121 /* System library. */
    122 
    123 #include <sys_defs.h>
    124 #include <ctype.h>
    125 #include <string.h>
    126 
    127 /* Utility library. */
    128 
    129 #include <vstring.h>
    130 #include <msg.h>
    131 #include <stringops.h>
    132 
    133 /* Global library. */
    134 
    135 #include "lex_822.h"
    136 #include "quote_822_local.h"
    137 #include "tok822.h"
    138 
    139  /*
    140   * I suppose this is my favorite macro. Used heavily for tokenizing.
    141   */
    142 #define COLLECT(t,s,c,cond) { \
    143 	while ((c = *(unsigned char *) s) != 0) { \
    144 	    if (c == '\\') { \
    145 		if ((c = *(unsigned char *)++s) == 0) \
    146 		    break; \
    147 	    } else if (!(cond)) { \
    148 		break; \
    149 	    } \
    150 	    VSTRING_ADDCH(t->vstr, IS_SPACE_TAB_CR_LF(c) ? ' ' : c); \
    151 	    s++; \
    152 	} \
    153 	VSTRING_TERMINATE(t->vstr); \
    154     }
    155 
    156 #define COLLECT_SKIP_LAST(t,s,c,cond) { COLLECT(t,s,c,cond); if (*s) s++; }
    157 
    158  /*
    159   * Not quite as complex. The parser depends heavily on it.
    160   */
    161 #define SKIP(tp, cond) { \
    162 	while (tp->type && (cond)) \
    163 	    tp = tp->prev; \
    164     }
    165 
    166 #define MOVE_COMMENT_AND_CONTINUE(tp, right) { \
    167 	TOK822 *prev = tok822_unlink(tp); \
    168 	right = tok822_prepend(right, tp); \
    169 	tp = prev; \
    170 	continue; \
    171     }
    172 
    173 #define SKIP_MOVE_COMMENT(tp, cond, right) { \
    174 	while (tp->type && (cond)) { \
    175 	    if (tp->type == TOK822_COMMENT) \
    176 		MOVE_COMMENT_AND_CONTINUE(tp, right); \
    177 	    tp = tp->prev; \
    178 	} \
    179     }
    180 
    181  /*
    182   * Single-character operators. We include the % and ! operators because not
    183   * all the world is RFC822. XXX Make this operator list configurable when we
    184   * have a real rewriting language. Include | for aliases file parsing.
    185   */
    186 static char tok822_opchar[] = "|%!" LEX_822_SPECIALS;
    187 static void tok822_quote_atom(TOK822 *);
    188 static const char *tok822_comment(TOK822 *, const char *);
    189 static TOK822 *tok822_group(int, TOK822 *, TOK822 *, int);
    190 static void tok822_copy_quoted(VSTRING *, char *, char *);
    191 static int tok822_append_space(TOK822 *);
    192 
    193 #define DO_WORD		(1<<0)		/* finding a word is ok here */
    194 #define DO_GROUP	(1<<1)		/* doing an address group */
    195 
    196 #define ADD_COMMA	','		/* resynchronize */
    197 #define NO_MISSING_COMMA 0
    198 
    199 /* tok822_internalize - token tree to string, internal form */
    200 
    201 VSTRING *tok822_internalize(VSTRING *vp, TOK822 *tree, int flags)
    202 {
    203     TOK822 *tp;
    204 
    205     if (flags & TOK822_STR_WIPE)
    206 	VSTRING_RESET(vp);
    207 
    208     for (tp = tree; tp; tp = tp->next) {
    209 	switch (tp->type) {
    210 	case ',':
    211 	    VSTRING_ADDCH(vp, tp->type);
    212 	    if (flags & TOK822_STR_LINE) {
    213 		VSTRING_ADDCH(vp, '\n');
    214 		continue;
    215 	    }
    216 	    break;
    217 	case TOK822_ADDR:
    218 	    tok822_internalize(vp, tp->head, TOK822_STR_NONE);
    219 	    break;
    220 	case TOK822_COMMENT:
    221 	case TOK822_ATOM:
    222 	case TOK822_QSTRING:
    223 	    vstring_strcat(vp, vstring_str(tp->vstr));
    224 	    break;
    225 	case TOK822_DOMLIT:
    226 	    VSTRING_ADDCH(vp, '[');
    227 	    vstring_strcat(vp, vstring_str(tp->vstr));
    228 	    VSTRING_ADDCH(vp, ']');
    229 	    break;
    230 	case TOK822_STARTGRP:
    231 	    VSTRING_ADDCH(vp, ':');
    232 	    break;
    233 	default:
    234 	    if (tp->type >= TOK822_MINTOK)
    235 		msg_panic("tok822_internalize: unknown operator %d", tp->type);
    236 	    VSTRING_ADDCH(vp, tp->type);
    237 	}
    238 	if (tok822_append_space(tp))
    239 	    VSTRING_ADDCH(vp, ' ');
    240     }
    241     if (flags & TOK822_STR_TERM)
    242 	VSTRING_TERMINATE(vp);
    243     return (vp);
    244 }
    245 
    246 /* strip_address - strip non-address text from address expression */
    247 
    248 static void strip_address(VSTRING *vp, ssize_t start, TOK822 *addr)
    249 {
    250     VSTRING *tmp;
    251 
    252     /*
    253      * Emit plain <address>. Discard any comments or phrases.
    254      */
    255     VSTRING_TERMINATE(vp);
    256     msg_warn("stripping too many comments from address: %.100s...",
    257 	     printable(vstring_str(vp) + start, '?'));
    258     vstring_truncate(vp, start);
    259     VSTRING_ADDCH(vp, '<');
    260     if (addr) {
    261 	tmp = vstring_alloc(100);
    262 	tok822_internalize(tmp, addr, TOK822_STR_TERM);
    263 	quote_822_local_flags(vp, vstring_str(tmp),
    264 			      QUOTE_FLAG_8BITCLEAN | QUOTE_FLAG_APPEND);
    265 	vstring_free(tmp);
    266     }
    267     VSTRING_ADDCH(vp, '>');
    268 }
    269 
    270 /* tok822_externalize - token tree to string, external form */
    271 
    272 VSTRING *tok822_externalize(VSTRING *vp, TOK822 *tree, int flags)
    273 {
    274     VSTRING *tmp;
    275     TOK822 *tp;
    276     ssize_t start;
    277     TOK822 *addr;
    278     ssize_t addr_len;
    279 
    280     /*
    281      * Guard against a Sendmail buffer overflow (CERT advisory CA-2003-07).
    282      * The problem was that Sendmail could store too much non-address text
    283      * (comments, phrases, etc.) into a static 256-byte buffer.
    284      *
    285      * When the buffer fills up, fixed Sendmail versions remove comments etc.
    286      * and reduce the information to just <$g>, which expands to <address>.
    287      * No change is made when an address expression (text separated by
    288      * commas) contains no address. This fix reportedly also protects
    289      * Sendmail systems that are still vulnerable to this problem.
    290      *
    291      * Postfix takes the same approach, grudgingly. To avoid unnecessary damage,
    292      * Postfix removes comments etc. only when the amount of non-address text
    293      * in an address expression (text separated by commas) exceeds 250 bytes.
    294      *
    295      * With Sendmail, the address part of an address expression is the
    296      * right-most <> instance in that expression. If an address expression
    297      * contains no <>, then Postfix guarantees that it contains at most one
    298      * non-comment string; that string is the address part of the address
    299      * expression, so there is no ambiguity.
    300      *
    301      * Finally, we note that stress testing shows that other code in Sendmail
    302      * 8.12.8 bluntly truncates ``text <address>'' to 256 bytes even when
    303      * this means chopping the <address> somewhere in the middle. This is a
    304      * loss of control that we're not entirely comfortable with. However,
    305      * unbalanced quotes and dangling backslash do not seem to influence the
    306      * way that Sendmail parses headers, so this is not an urgent problem.
    307      */
    308 #define MAX_NONADDR_LENGTH 250
    309 
    310 #define RESET_NONADDR_LENGTH { \
    311 	start = VSTRING_LEN(vp); \
    312 	addr = 0; \
    313 	addr_len = 0; \
    314     }
    315 
    316 #define ENFORCE_NONADDR_LENGTH do { \
    317 	if (addr && VSTRING_LEN(vp) - addr_len > start + MAX_NONADDR_LENGTH) \
    318 	    strip_address(vp, start, addr->head); \
    319     } while(0)
    320 
    321     if (flags & TOK822_STR_WIPE)
    322 	VSTRING_RESET(vp);
    323 
    324     if (flags & TOK822_STR_TRNC)
    325 	RESET_NONADDR_LENGTH;
    326 
    327     for (tp = tree; tp; tp = tp->next) {
    328 	switch (tp->type) {
    329 	case ',':
    330 	    if (flags & TOK822_STR_TRNC)
    331 		ENFORCE_NONADDR_LENGTH;
    332 	    VSTRING_ADDCH(vp, tp->type);
    333 	    VSTRING_ADDCH(vp, (flags & TOK822_STR_LINE) ? '\n' : ' ');
    334 	    if (flags & TOK822_STR_TRNC)
    335 		RESET_NONADDR_LENGTH;
    336 	    continue;
    337 
    338 	    /*
    339 	     * XXX In order to correctly externalize an address, it is not
    340 	     * sufficient to quote individual atoms. There are higher-level
    341 	     * rules that say when an address localpart needs to be quoted.
    342 	     * We wing it with the quote_822_local() routine, which ignores
    343 	     * the issue of atoms in the domain part that would need quoting.
    344 	     */
    345 	case TOK822_ADDR:
    346 	    addr = tp;
    347 	    tmp = vstring_alloc(100);
    348 	    tok822_internalize(tmp, tp->head, TOK822_STR_TERM);
    349 	    addr_len = VSTRING_LEN(vp);
    350 	    quote_822_local_flags(vp, vstring_str(tmp),
    351 				  QUOTE_FLAG_8BITCLEAN | QUOTE_FLAG_APPEND);
    352 	    addr_len = VSTRING_LEN(vp) - addr_len;
    353 	    vstring_free(tmp);
    354 	    break;
    355 	case TOK822_ATOM:
    356 	case TOK822_COMMENT:
    357 	    vstring_strcat(vp, vstring_str(tp->vstr));
    358 	    break;
    359 	case TOK822_QSTRING:
    360 	    VSTRING_ADDCH(vp, '"');
    361 	    tok822_copy_quoted(vp, vstring_str(tp->vstr), "\"\\\r\n");
    362 	    VSTRING_ADDCH(vp, '"');
    363 	    break;
    364 	case TOK822_DOMLIT:
    365 	    VSTRING_ADDCH(vp, '[');
    366 	    tok822_copy_quoted(vp, vstring_str(tp->vstr), "\\\r\n");
    367 	    VSTRING_ADDCH(vp, ']');
    368 	    break;
    369 	case TOK822_STARTGRP:
    370 	    VSTRING_ADDCH(vp, ':');
    371 	    break;
    372 	case '<':
    373 	    if (tp->next && tp->next->type == '>') {
    374 		addr = tp;
    375 		addr_len = 0;
    376 	    }
    377 	    VSTRING_ADDCH(vp, '<');
    378 	    break;
    379 	default:
    380 	    if (tp->type >= TOK822_MINTOK)
    381 		msg_panic("tok822_externalize: unknown operator %d", tp->type);
    382 	    VSTRING_ADDCH(vp, tp->type);
    383 	}
    384 	if (tok822_append_space(tp))
    385 	    VSTRING_ADDCH(vp, ' ');
    386     }
    387     if (flags & TOK822_STR_TRNC)
    388 	ENFORCE_NONADDR_LENGTH;
    389 
    390     if (flags & TOK822_STR_TERM)
    391 	VSTRING_TERMINATE(vp);
    392     return (vp);
    393 }
    394 
    395 /* tok822_copy_quoted - copy a string while quoting */
    396 
    397 static void tok822_copy_quoted(VSTRING *vp, char *str, char *quote_set)
    398 {
    399     int     ch;
    400 
    401     while ((ch = *(unsigned char *) str++) != 0) {
    402 	if (strchr(quote_set, ch))
    403 	    VSTRING_ADDCH(vp, '\\');
    404 	VSTRING_ADDCH(vp, ch);
    405     }
    406 }
    407 
    408 /* tok822_append_space - see if space is needed after this token */
    409 
    410 static int tok822_append_space(TOK822 *tp)
    411 {
    412     TOK822 *next;
    413 
    414     if (tp == 0 || (next = tp->next) == 0 || tp->owner != 0)
    415 	return (0);
    416     if (tp->type == ',' || tp->type == TOK822_STARTGRP || next->type == '<')
    417 	return (1);
    418 
    419 #define NON_OPERATOR(x) \
    420     (x->type == TOK822_ATOM || x->type == TOK822_QSTRING \
    421      || x->type == TOK822_COMMENT || x->type == TOK822_DOMLIT \
    422      || x->type == TOK822_ADDR)
    423 
    424     return (NON_OPERATOR(tp) && NON_OPERATOR(next));
    425 }
    426 
    427 /* tok822_scan_limit - tokenize string */
    428 
    429 TOK822 *tok822_scan_limit(const char *str, TOK822 **tailp, int tok_count_limit)
    430 {
    431     TOK822 *head = 0;
    432     TOK822 *tail = 0;
    433     TOK822 *tp;
    434     int     ch;
    435     int     tok_count = 0;
    436 
    437     /*
    438      * XXX 2822 new feature: Section 4.1 allows "." to appear in a phrase (to
    439      * allow for forms such as: Johnny B. Goode <johhny (at) domain.org>. I cannot
    440      * handle that at the tokenizer level - it is not context sensitive. And
    441      * to fix this at the parser level requires radical changes to preserve
    442      * white space as part of the token stream. Thanks a lot, people.
    443      */
    444     while ((ch = *(unsigned char *) str++) != 0) {
    445 	if (IS_SPACE_TAB_CR_LF(ch))
    446 	    continue;
    447 	if (ch == '(') {
    448 	    tp = tok822_alloc(TOK822_COMMENT, (char *) 0);
    449 	    str = tok822_comment(tp, str);
    450 	} else if (ch == '[') {
    451 	    tp = tok822_alloc(TOK822_DOMLIT, (char *) 0);
    452 	    COLLECT_SKIP_LAST(tp, str, ch, ch != ']');
    453 	} else if (ch == '"') {
    454 	    tp = tok822_alloc(TOK822_QSTRING, (char *) 0);
    455 	    COLLECT_SKIP_LAST(tp, str, ch, ch != '"');
    456 	} else if (ch != '\\' && strchr(tok822_opchar, ch)) {
    457 	    tp = tok822_alloc(ch, (char *) 0);
    458 	} else {
    459 	    tp = tok822_alloc(TOK822_ATOM, (char *) 0);
    460 	    str -= 1;				/* \ may be first */
    461 	    COLLECT(tp, str, ch, !IS_SPACE_TAB_CR_LF(ch) && !strchr(tok822_opchar, ch));
    462 	    tok822_quote_atom(tp);
    463 	}
    464 	if (head == 0) {
    465 	    head = tail = tp;
    466 	    while (tail->next)
    467 		tail = tail->next;
    468 	} else {
    469 	    tail = tok822_append(tail, tp);
    470 	}
    471 	if (tok_count_limit > 0 && ++tok_count >= tok_count_limit)
    472 	    break;
    473     }
    474     if (tailp)
    475 	*tailp = tail;
    476     return (head);
    477 }
    478 
    479 /* tok822_parse_limit - translate external string to token tree */
    480 
    481 TOK822 *tok822_parse_limit(const char *str, int tok_count_limit)
    482 {
    483     TOK822 *head;
    484     TOK822 *tail;
    485     TOK822 *right;
    486     TOK822 *first_token;
    487     TOK822 *last_token;
    488     TOK822 *tp;
    489     int     state;
    490 
    491     /*
    492      * First, tokenize the string, from left to right. We are not allowed to
    493      * throw away any information that we do not understand. With a flat
    494      * token list that contains all tokens, we can always convert back to
    495      * string form.
    496      */
    497     if ((first_token = tok822_scan_limit(str, &last_token, tok_count_limit)) == 0)
    498 	return (0);
    499 
    500     /*
    501      * For convenience, sandwich the token list between two sentinel tokens.
    502      */
    503 #define GLUE(left,rite) { left->next = rite; rite->prev = left; }
    504 
    505     head = tok822_alloc(0, (char *) 0);
    506     GLUE(head, first_token);
    507     tail = tok822_alloc(0, (char *) 0);
    508     GLUE(last_token, tail);
    509 
    510     /*
    511      * Next step is to transform the token list into a parse tree. This is
    512      * done most conveniently from right to left. If there is something that
    513      * we do not understand, just leave it alone, don't throw it away. The
    514      * address information that we're looking for sits in-between the current
    515      * node (tp) and the one called right. Add missing commas on the fly.
    516      */
    517     state = DO_WORD;
    518     right = tail;
    519     tp = tail->prev;
    520     while (tp->type) {
    521 	if (tp->type == TOK822_COMMENT) {	/* move comment to the side */
    522 	    MOVE_COMMENT_AND_CONTINUE(tp, right);
    523 	} else if (tp->type == ';') {		/* rh side of named group */
    524 	    right = tok822_group(TOK822_ADDR, tp, right, ADD_COMMA);
    525 	    state = DO_GROUP | DO_WORD;
    526 	} else if (tp->type == ':' && (state & DO_GROUP) != 0) {
    527 	    tp->type = TOK822_STARTGRP;
    528 	    (void) tok822_group(TOK822_ADDR, tp, right, NO_MISSING_COMMA);
    529 	    SKIP(tp, tp->type != ',');
    530 	    right = tp;
    531 	    continue;
    532 	} else if (tp->type == '>') {		/* rh side of <route> */
    533 	    right = tok822_group(TOK822_ADDR, tp, right, ADD_COMMA);
    534 	    SKIP_MOVE_COMMENT(tp, tp->type != '<', right);
    535 	    (void) tok822_group(TOK822_ADDR, tp, right, NO_MISSING_COMMA);
    536 	    SKIP(tp, tp->type > 0xff || strchr(">;,:", tp->type) == 0);
    537 	    right = tp;
    538 	    state |= DO_WORD;
    539 	    continue;
    540 	} else if (tp->type == TOK822_ATOM || tp->type == TOK822_QSTRING
    541 		   || tp->type == TOK822_DOMLIT) {
    542 	    if ((state & DO_WORD) == 0)
    543 		right = tok822_group(TOK822_ADDR, tp, right, ADD_COMMA)->next;
    544 	    state &= ~DO_WORD;
    545 	} else if (tp->type == ',') {
    546 	    right = tok822_group(TOK822_ADDR, tp, right, NO_MISSING_COMMA);
    547 	    state |= DO_WORD;
    548 	} else {
    549 	    state |= DO_WORD;
    550 	}
    551 	tp = tp->prev;
    552     }
    553     (void) tok822_group(TOK822_ADDR, tp, right, NO_MISSING_COMMA);
    554 
    555     /*
    556      * Discard the sentinel tokens on the left and right extremes. Properly
    557      * terminate the resulting list.
    558      */
    559     tp = (head->next != tail ? head->next : 0);
    560     tok822_cut_before(head->next);
    561     tok822_free(head);
    562     tok822_cut_before(tail);
    563     tok822_free(tail);
    564     return (tp);
    565 }
    566 
    567 /* tok822_quote_atom - see if an atom needs quoting when externalized */
    568 
    569 static void tok822_quote_atom(TOK822 *tp)
    570 {
    571     char   *cp;
    572     int     ch;
    573 
    574     /*
    575      * RFC 822 expects 7-bit data. Rather than quoting every 8-bit character
    576      * (and still passing it on as 8-bit data) we leave 8-bit data alone.
    577      */
    578     for (cp = vstring_str(tp->vstr); (ch = *(unsigned char *) cp) != 0; cp++) {
    579 	if ( /* !ISASCII(ch) || */ ch == ' '
    580 	    || ISCNTRL(ch) || strchr(tok822_opchar, ch)) {
    581 	    tp->type = TOK822_QSTRING;
    582 	    break;
    583 	}
    584     }
    585 }
    586 
    587 /* tok822_comment - tokenize comment */
    588 
    589 static const char *tok822_comment(TOK822 *tp, const char *str)
    590 {
    591     int     level = 1;
    592     int     ch;
    593 
    594     /*
    595      * XXX We cheat by storing comments in their external form. Otherwise it
    596      * would be a royal pain to preserve \ before (. That would require a
    597      * recursive parser; the easy to implement stack-based recursion would be
    598      * too expensive.
    599      */
    600     VSTRING_ADDCH(tp->vstr, '(');
    601 
    602     while ((ch = *(unsigned char *) str) != 0) {
    603 	VSTRING_ADDCH(tp->vstr, ch);
    604 	str++;
    605 	if (ch == '(') {			/* comments can nest! */
    606 	    level++;
    607 	} else if (ch == ')') {
    608 	    if (--level == 0)
    609 		break;
    610 	} else if (ch == '\\') {
    611 	    if ((ch = *(unsigned char *) str) == 0)
    612 		break;
    613 	    VSTRING_ADDCH(tp->vstr, ch);
    614 	    str++;
    615 	}
    616     }
    617     VSTRING_TERMINATE(tp->vstr);
    618     return (str);
    619 }
    620 
    621 /* tok822_group - cluster a group of tokens */
    622 
    623 static TOK822 *tok822_group(int group_type, TOK822 *left, TOK822 *right, int sync_type)
    624 {
    625     TOK822 *group;
    626     TOK822 *sync;
    627     TOK822 *first;
    628 
    629     /*
    630      * Cluster the tokens between left and right under their own parse tree
    631      * node. Optionally insert a resync token.
    632      */
    633     if (left != right && (first = left->next) != right) {
    634 	tok822_cut_before(right);
    635 	tok822_cut_before(first);
    636 	group = tok822_alloc(group_type, (char *) 0);
    637 	tok822_sub_append(group, first);
    638 	tok822_append(left, group);
    639 	tok822_append(group, right);
    640 	if (sync_type) {
    641 	    sync = tok822_alloc(sync_type, (char *) 0);
    642 	    tok822_append(left, sync);
    643 	}
    644     }
    645     return (left);
    646 }
    647 
    648 /* tok822_scan_addr - convert external address string to address token */
    649 
    650 TOK822 *tok822_scan_addr(const char *addr)
    651 {
    652     TOK822 *tree = tok822_alloc(TOK822_ADDR, (char *) 0);
    653 
    654     tree->head = tok822_scan(addr, &tree->tail);
    655     return (tree);
    656 }
    657 
    658 #ifdef TEST
    659 
    660 #include <unistd.h>
    661 #include <vstream.h>
    662 #include <readlline.h>
    663 
    664 /* tok822_print - display token */
    665 
    666 static void tok822_print(TOK822 *list, int indent)
    667 {
    668     TOK822 *tp;
    669 
    670     for (tp = list; tp; tp = tp->next) {
    671 	if (tp->type < TOK822_MINTOK) {
    672 	    vstream_printf("%*s %s \"%c\"\n", indent, "", "OP", tp->type);
    673 	} else if (tp->type == TOK822_ADDR) {
    674 	    vstream_printf("%*s %s\n", indent, "", "address");
    675 	    tok822_print(tp->head, indent + 2);
    676 	} else if (tp->type == TOK822_STARTGRP) {
    677 	    vstream_printf("%*s %s\n", indent, "", "group \":\"");
    678 	} else {
    679 	    vstream_printf("%*s %s \"%s\"\n", indent, "",
    680 			   tp->type == TOK822_COMMENT ? "comment" :
    681 			   tp->type == TOK822_ATOM ? "atom" :
    682 			   tp->type == TOK822_QSTRING ? "quoted string" :
    683 			   tp->type == TOK822_DOMLIT ? "domain literal" :
    684 			   tp->type == TOK822_ADDR ? "address" :
    685 			   "unknown\n", vstring_str(tp->vstr));
    686 	}
    687     }
    688 }
    689 
    690 int     main(int unused_argc, char **unused_argv)
    691 {
    692     VSTRING *vp = vstring_alloc(100);
    693     TOK822 *list;
    694     VSTRING *buf = vstring_alloc(100);
    695 
    696 #define TEST_TOKEN_LIMIT 20
    697 
    698     while (readlline(buf, VSTREAM_IN, (int *) 0)) {
    699 	while (VSTRING_LEN(buf) > 0 && vstring_end(buf)[-1] == '\n') {
    700 	    vstring_end(buf)[-1] = 0;
    701 	    vstring_truncate(buf, VSTRING_LEN(buf) - 1);
    702 	}
    703 	if (!isatty(vstream_fileno(VSTREAM_IN)))
    704 	    vstream_printf(">>>%s<<<\n\n", vstring_str(buf));
    705 	list = tok822_parse_limit(vstring_str(buf), TEST_TOKEN_LIMIT);
    706 	vstream_printf("Parse tree:\n");
    707 	tok822_print(list, 0);
    708 	vstream_printf("\n");
    709 
    710 	vstream_printf("Internalized:\n%s\n\n",
    711 		vstring_str(tok822_internalize(vp, list, TOK822_STR_DEFL)));
    712 	vstream_fflush(VSTREAM_OUT);
    713 	vstream_printf("Externalized, no newlines inserted:\n%s\n\n",
    714 		       vstring_str(tok822_externalize(vp, list,
    715 				       TOK822_STR_DEFL | TOK822_STR_TRNC)));
    716 	vstream_fflush(VSTREAM_OUT);
    717 	vstream_printf("Externalized, newlines inserted:\n%s\n\n",
    718 		       vstring_str(tok822_externalize(vp, list,
    719 		     TOK822_STR_DEFL | TOK822_STR_LINE | TOK822_STR_TRNC)));
    720 	vstream_fflush(VSTREAM_OUT);
    721 	tok822_free_tree(list);
    722     }
    723     vstring_free(vp);
    724     vstring_free(buf);
    725     return (0);
    726 }
    727 
    728 #endif
    729