Home | History | Annotate | Line # | Download | only in mail
      1 /*	$NetBSD: mime_header.c,v 1.9 2013/02/14 18:23:45 christos Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2006 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Anon Ymous.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 
     33 /*
     34  * This module contains the core MIME header decoding routines.
     35  * Please refer to RFC 2047 and RFC 2822.
     36  */
     37 
     38 #ifdef MIME_SUPPORT
     39 
     40 #include <sys/cdefs.h>
     41 #ifndef __lint__
     42 __RCSID("$NetBSD: mime_header.c,v 1.9 2013/02/14 18:23:45 christos Exp $");
     43 #endif /* not __lint__ */
     44 
     45 #include <assert.h>
     46 #include <stdio.h>
     47 #include <stdlib.h>
     48 #include <string.h>
     49 
     50 #include "def.h"
     51 #include "extern.h"
     52 #include "mime.h"
     53 #include "mime_header.h"
     54 #include "mime_codecs.h"
     55 
/*
 * Copy the charset name of an encoded-word into from_cs.
 *
 * p points just past the leading "=?".  Characters are copied until
 * the next '?' and the result is NUL terminated.
 *
 * Returns a pointer to the character following that '?', or NULL if
 * the string ends before a '?' is seen or the name (plus its NUL)
 * does not fit in from_cs_len bytes.
 */
static const char *
grab_charset(char *from_cs, size_t from_cs_len, const char *p)
{
	size_t used = 0;

	while (*p != '?') {
		/* always keep one byte in reserve for the terminator */
		if (*p == '\0' || used + 1 >= from_cs_len)
			return NULL;
		from_cs[used++] = *p++;
	}
	from_cs[used] = '\0';
	return p + 1;	/* step over the '?' that stopped the scan */
}
     69 
     70 /*
     71  * An encoded word is a string of at most 75 non-white space
     72  * characters of the following form:
     73  *
     74  *  =?charset?X?encoding?=
     75  *
     76  * where:
     77  *   'charset'	is the original character set of the unencoded string.
     78  *
     79  *   'X'	is the encoding type 'B' or 'Q' for "base64" or
     80  *              "quoted-printable", respectively,
     81  *   'encoding'	is the encoded string.
     82  *
     83  * Both 'charset' and 'X' are case independent and 'encoding' cannot
     84  * contain any whitespace or '?' characters.  The 'encoding' must also
     85  * be fully contained within the encoded words, i.e., it cannot be
     86  * split between encoded words.
     87  *
     88  * Note: the 'B' encoding is a slightly modified "quoted-printable"
     89  * encoding.  In particular, spaces (' ') may be encoded as '_' to
     90  * improve undecoded readability.
     91  */
     92 static int
     93 decode_word(const char **ibuf, char **obuf, char *oend, const char *to_cs)
     94 {
     95 	ssize_t declen;
     96 	size_t enclen, dstlen;
     97 	char decword[LINESIZE];
     98 	char from_cs[LINESIZE];
     99 	const char *encword, *iend, *p;
    100 	char *dstend;
    101 	char enctype;
    102 
    103 	p = *ibuf;
    104 	if (p[0] != '=' && p[1] != '?')
    105 		return -1;
    106 	if (strlen(p) <  2 + 1 + 3 + 1 + 2)
    107 		return -1;
    108 	p = grab_charset(from_cs, sizeof(from_cs), p + 2);
    109 	if (p == NULL)
    110 		return -1;
    111 	enctype = *p++;
    112 	if (*p++ != '?')
    113 		return -1;
    114 	encword = p;
    115 	p = strchr(p, '?');
    116 	if (p == NULL || p[1] != '=')
    117 		return -1;
    118 	enclen = p - encword;	/* length of encoded substring */
    119 	iend = p + 2;
    120 	/* encoded words are at most 75 characters (RFC 2047, sec 2) */
    121 	if (iend > *ibuf + 75)
    122 		return -1;
    123 
    124 	if (oend < *obuf + 1) {
    125 		assert(/*CONSTCOND*/ 0);	/* We have a coding error! */
    126 		return -1;
    127 	}
    128 	dstend = to_cs ? decword : *obuf;
    129 	dstlen = (to_cs ? sizeof(decword) : (size_t)(oend - *obuf)) - 1;
    130 
    131 	declen = mime_rfc2047_decode(enctype, dstend, dstlen, encword, enclen);
    132 	if (declen == -1)
    133 		return -1;
    134 
    135 	dstend += declen;
    136 #ifdef CHARSET_SUPPORT
    137 	if (to_cs != NULL) {
    138 		iconv_t cd;
    139 		const char *src;
    140 		size_t srclen;
    141 		size_t cnt;
    142 
    143 		cd = iconv_open(to_cs, from_cs);
    144 		if (cd == (iconv_t)-1)
    145 			return -1;
    146 
    147 		src = decword;
    148 		srclen = declen;
    149 		dstend = *obuf;
    150 		dstlen = oend - *obuf - 1;
    151 		cnt = mime_iconv(cd, &src, &srclen, &dstend, &dstlen);
    152 
    153 		(void)iconv_close(cd);
    154 		if (cnt == (size_t)-1)
    155 			return -1;
    156 	}
    157 #endif /* CHARSET_SUPPORT */
    158 	*dstend = '\0';
    159 	*ibuf = iend;
    160 	*obuf = dstend;
    161 	return 0;
    162 }
    163 
    164 
    165 /*
    166  * Folding White Space.  See RFC 2822.
    167  *
    168  * Note: RFC 2822 specifies that '\n' and '\r' only occur as CRLF
    169  * pairs (i.e., "\r\n") and never separately.  However, by the time
    170  * mail(1) sees the messages, all CRLF pairs have been converted to
    171  * '\n' characters.
    172  *
    173  * XXX - pull is_FWS() and skip_FWS() up to def.h?
    174  */
/* Return non-zero iff c is a space, a tab or a newline. */
static inline int
is_FWS(int c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
    180 
/*
 * Return a pointer to the first character at or after p that is not
 * folding white space.  The terminating NUL is not white space, so
 * the scan never runs off the end of a NUL terminated string.
 */
static inline const char *
skip_FWS(const char *p)
{
	for (; is_FWS(*p); p++)
		continue;
	return p;
}
    188 
/*
 * Flush white space that was skipped after an encoded word.
 *
 * If *src is non-NULL, the bytes in [*src, srcend) are copied to *dst
 * (never writing at or beyond dstend), *dst is advanced past what was
 * written and *src is reset to NULL.  If *src is NULL nothing happens.
 */
static inline void
copy_skipped_FWS(char **dst, char *dstend, const char **src, const char *srcend)
{
	const char *in = *src;
	char *out;

	if (in == NULL)
		return;		/* nothing pending */

	for (out = *dst; in < srcend && out < dstend; in++, out++)
		*out = *in;

	*dst = out;
	*src = NULL;
}
    207 
    208 /*
    209  * Decode an unstructured field.
    210  *
    211  * See RFC 2822 Sec 2.2.1 and 3.6.5.
    212  * Encoded words may occur anywhere in unstructured fields provided
    213  * they are separated from any other text or encoded words by at least
    214  * one linear-white-space character. (See RFC 2047 sec 5.1.)  If two
    215  * encoded words occur sequentially (separated by only FWS) then the
    216  * separating FWS is removed.
    217  *
    218  * NOTE: unstructured fields cannot contain 'quoted-pairs' (see
    219  * RFC2822 sec 3.2.6 and RFC 2047), but that is no problem as a '\\'
    220  * (or any non-whitespace character) immediately before an
    221  * encoded-word will prevent it from being decoded.
    222  *
    223  * hstring should be a NULL terminated string.
    224  * outbuf should be sufficiently large to hold the result.
    225  */
    226 static void
    227 mime_decode_usfield(char *outbuf, size_t outsize, const char *hstring)
    228 {
    229 	const char *p, *p0;
    230 	char *q, *qend;
    231 	int lastc;
    232 	const char *charset;
    233 
    234 	charset = value(ENAME_MIME_CHARSET);
    235 	qend = outbuf + outsize - 1; /* Make sure there is room for the trailing NULL! */
    236 	q = outbuf;
    237 	p = hstring;
    238 	p0 = NULL;
    239 	lastc = (unsigned char)' ';
    240 	while (*p && q < qend) {
    241 		const char *p1;
    242 		char *q1;
    243 		if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' &&
    244 		    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
    245 		    (*p1 == '\0' || is_FWS(*p1))) {
    246 			p0 = p1;  /* pointer to first character after encoded word */
    247 			q = q1;
    248 			p = skip_FWS(p1);
    249 			lastc = (unsigned char)*p0;
    250 		}
    251 		else {
    252 			copy_skipped_FWS(&q, qend, &p0, p);
    253 			lastc = (unsigned char)*p;
    254 			if (q < qend)
    255 				*q++ = *p++;
    256 		}
    257 	}
    258 	copy_skipped_FWS(&q, qend, &p0, p);
    259 	*q = '\0';
    260 }
    261 
/*
 * Decode a field comment.
 *
 * Comments only occur in structured fields, can be nested (rfc 2822,
 * sec 3.2.3), and can contain 'encoded-words' and 'quoted-pairs'.
 * Otherwise, they can be regarded as unstructured fields that are
 * bounded by '(' and ')' characters.
 *
 * On entry *ibuf points just past the opening '(' (already copied by
 * the caller).  On return *ibuf and *obuf are advanced past what was
 * consumed and produced, including the closing ')' if one was found.
 * Nothing is written at or beyond oend.  Returns 0, or -1 if a nested
 * call reports -1.
 */
static int
decode_comment(char **obuf, char *oend, const char **ibuf, const char *iend, const char *charset)
{
	const char *in = *ibuf;
	const char *pending = NULL;	/* FWS skipped after an encoded word */
	char *out = *obuf;
	int prev = ' ';			/* the '(' boundary counts as FWS */

	while (in < iend && out < oend) {
		const char *word_end = in;
		char *word_out = out;

		if (is_FWS(prev) && in[0] == '=' && in[1] == '?' &&
		    decode_word(&word_end, &word_out, oend, charset) == 0 &&
		    (*word_end == ')' || is_FWS(*word_end))) {
			prev = (unsigned char)*word_end;
			pending = word_end;
			out = word_out;
			in = skip_FWS(word_end);
			/*
			 * XXX - this check should be unnecessary as *iend
			 * should be '\0' which will stop skip_FWS()
			 */
			if (in > iend)
				in = iend;
			continue;
		}

		copy_skipped_FWS(&out, oend, &pending, in);
		if (out >= oend)	/* XXX - out > oend cannot happen */
			break;

		if (*in == ')') {
			*out++ = *in++;	/* copy the closing ')' */
			break;		/* this comment is finished */
		}

		if (*in == '(') {	/* nested comment */
			*out++ = *in++;	/* copy the opening '(' */
			if (decode_comment(&out, oend, &in, iend, charset) == -1)
				return -1;	/* is this right or should we update? */
			prev = ')';
			continue;
		}

		if (*in == '\\' && in + 1 < iend) {	/* quoted-pair */
			const char quoted = in[1];

			/* keep the backslash only where it is still needed */
			if (quoted == '(' || quoted == ')' || quoted == '\\')
				*out++ = *in;
			in++;
			prev = (unsigned char)*in;
			if (out < oend)
				*out++ = *in++;
			continue;
		}

		prev = (unsigned char)*in;
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
	return 0;
}
    335 
/*
 * Decode a quoted-string or no-fold-quote.
 *
 * These cannot contain encoded words.  They can contain quoted-pairs,
 * making '\\' special.  They have no other structure.  See RFC 2822
 * sec 3.2.5 and 3.6.4.
 *
 * On entry *ibuf points just past the opening '"'.  Text is copied up
 * to and including the closing '"'.  The backslash of a quoted-pair
 * is kept only in front of '"' and '\\'.  Copying stops early at iend
 * or when the output reaches oend.  *ibuf and *obuf are advanced.
 */
static void
decode_quoted_string(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *in = *ibuf;
	char *out = *obuf;

	while (in < iend && out < oend) {
		const char c = *in;

		if (c == '"') {
			*out++ = *in++;	/* copy the closing '"' */
			break;
		}
		if (c == '\\' && iend - in > 1) {	/* quoted-pair */
			const char next = in[1];

			if (next == '"' || next == '\\') {
				*out++ = c;
				if (out >= oend)
					break;
			}
			in++;	/* step onto the quoted character */
		}
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
}
    371 
/*
 * Decode a domain-literal or no-fold-literal.
 *
 * These cannot contain encoded words.  They can have quoted pairs and
 * are delimited by '[' and ']' making '\\', '[', and ']' special.
 * They have no other structure.  See RFC 2822 sec 3.4.1 and 3.6.4.
 *
 * On entry *ibuf points just past the opening '['.  Text is copied up
 * to and including the closing ']'.  The backslash of a quoted-pair
 * is kept only in front of '[', ']' and '\\'.  Copying stops early at
 * iend or when the output reaches oend.  *ibuf and *obuf are advanced.
 */
static void
decode_domain_literal(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *in = *ibuf;
	char *out = *obuf;

	while (in < iend && out < oend) {
		const char c = *in;

		if (c == ']') {
			*out++ = *in++;	/* copy the closing ']' */
			break;
		}
		if (c == '\\' && iend - in > 1) {	/* quoted-pair */
			const char next = in[1];

			if (next == '[' || next == ']' || next == '\\') {
				*out++ = c;
				if (out >= oend)
					break;
			}
			in++;	/* step onto the quoted character */
		}
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
}
    407 
/*
 * Specials: see RFC 2822 sec 3.2.1.
 *
 * Returns 1 if c is one of
 *	" ( ) , . : ; < > @ [ \ ]
 * and 0 for every other value, including anything outside 0..127.
 */
static inline int
is_specials(int c)
{
	switch (c) {
	case '"':
	case '(': case ')':
	case ',': case '.':
	case ':': case ';':
	case '<': case '>':
	case '@':
	case '[': case '\\': case ']':
		return 1;
	default:
		return 0;
	}
}
    427 
    428 /*
    429  * Decode a structured field.
    430  *
    431  * At the top level, structured fields can only contain encoded-words
    432  * via 'phrases' and 'comments'.  See RFC 2047 sec 5.
    433  */
    434 static void
    435 mime_decode_sfield(char *linebuf, size_t bufsize, const char *hstring)
    436 {
    437 	const char *p, *pend, *p0;
    438 	char *q, *qend;
    439 	const char *charset;
    440 	int lastc;
    441 
    442 	charset = value(ENAME_MIME_CHARSET);
    443 
    444 	p = hstring;
    445 	q = linebuf;
    446 	pend = hstring + strlen(hstring);
    447 	qend = linebuf + bufsize - 1;	/* save room for the NULL terminator */
    448 	lastc = (unsigned char)' ';
    449 	p0 = NULL;
    450 	while (p < pend && q < qend) {
    451 		const char *p1;
    452 		char *q1;
    453 
    454 		if (*p != '=') {
    455 			copy_skipped_FWS(&q, qend, &p0, p);
    456 			if (q >= qend)
    457 				break;
    458 		}
    459 
    460 		switch (*p) {
    461 		case '(':	/* start of comment */
    462 			*q++ = *p++;	/* copy the opening '(' */
    463 			(void)decode_comment(&q, qend, &p, pend, charset);
    464 			lastc = (unsigned char)p[-1];
    465 			break;
    466 
    467 		case '"':	/* start of quoted-string or no-fold-quote */
    468 			*q++ = *p++;	/* copy the opening '"' */
    469 			decode_quoted_string(&q, qend, &p, pend);
    470 			lastc = (unsigned char)p[-1];
    471 			break;
    472 
    473 		case '[':	/* start of domain-literal or no-fold-literal */
    474 			*q++ = *p++;	/* copy the opening '[' */
    475 			decode_domain_literal(&q, qend, &p, pend);
    476 			lastc = (unsigned char)p[-1];
    477 			break;
    478 
    479 		case '\\':	/* start of quoted-pair */
    480 			if (p + 1 < pend) {		/* quoted pair */
    481 				if (is_specials(p[1])) {
    482 					*q++ = *p;
    483 					if (q >= qend)
    484 						break;
    485 				}
    486 				p++;	/* skip the '\\' */
    487 			}
    488 			goto copy_char;
    489 
    490 		case '=':
    491 			/*
    492 			 * At this level encoded words can appear via
    493 			 * 'phrases' (possibly delimited by ',' as in
    494 			 * 'keywords').  Thus we handle them as such.
    495 			 * Hopefully this is sufficient.
    496 			 */
    497 			if ((lastc == ',' || is_FWS(lastc)) && p[1] == '?' &&
    498 			    decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
    499 			    (*p1 == '\0' || *p1 == ',' || is_FWS(*p1))) {
    500 				lastc = (unsigned char)*p1;
    501 				p0 = p1;
    502 				q = q1;
    503 				p = skip_FWS(p1);
    504 				/*
    505 				 * XXX - this check should be
    506 				 * unnecessary as *pend should be '\0'
    507 				 * which will stop skip_FWS()
    508 				 */
    509 				if (p > pend)
    510 					p = pend;
    511 				break;
    512 			}
    513 			else {
    514 				copy_skipped_FWS(&q, qend, &p0, p);
    515 				if (q >= qend)
    516 					break;
    517 				goto copy_char;
    518 			}
    519 
    520 		case '<':	/* start of angle-addr, msg-id, or path. */
    521 			/*
    522 			 * A msg-id cannot contain encoded-pairs or
    523 			 * encoded-words, but angle-addr and path can.
    524 			 * Distinguishing between them seems to be
    525 			 * unnecessary, so let's be loose and just
    526 			 * decode them as if they were all the same.
    527 			 */
    528 		default:
    529 	copy_char:
    530 			lastc = (unsigned char)*p;
    531 			*q++ = *p++;
    532 			break;
    533 		}
    534 	}
    535 	copy_skipped_FWS(&q, qend, &p0, p);
    536 	*q = '\0';	/* null terminate the result! */
    537 }
    538 
    539 /*
    540  * Returns the correct hfield decoder, or NULL if none.
    541  * Info extracted from RFC 2822.
    542  *
    543  * name - pointer to field name of header line (with colon).
    544  */
    545 PUBLIC hfield_decoder_t
    546 mime_hfield_decoder(const char *name)
    547 {
    548 	static const struct field_decoder_tbl_s {
    549 		const char *field_name;
    550 		size_t field_len;
    551 		hfield_decoder_t decoder;
    552 	} field_decoder_tbl[] = {
    553 #define X(s)	s, sizeof(s) - 1
    554 		{ X("Received:"),			NULL },
    555 
    556 		{ X("Content-Type:"),			NULL },
    557 		{ X("Content-Disposition:"),		NULL },
    558 		{ X("Content-Transfer-Encoding:"),	NULL },
    559 		{ X("Content-Description:"),		mime_decode_sfield },
    560 		{ X("Content-ID:"),			mime_decode_sfield },
    561 		{ X("MIME-Version:"),			mime_decode_sfield },
    562 
    563 		{ X("Bcc:"),				mime_decode_sfield },
    564 		{ X("Cc:"),				mime_decode_sfield },
    565 		{ X("Date:"),				mime_decode_sfield },
    566 		{ X("From:"),				mime_decode_sfield },
    567 		{ X("In-Reply-To:"),			mime_decode_sfield },
    568 		{ X("Keywords:"),			mime_decode_sfield },
    569 		{ X("Message-ID:"),			mime_decode_sfield },
    570 		{ X("References:"),			mime_decode_sfield },
    571 		{ X("Reply-To:"),			mime_decode_sfield },
    572 		{ X("Return-Path:"),			mime_decode_sfield },
    573 		{ X("Sender:"),				mime_decode_sfield },
    574 		{ X("To:"),				mime_decode_sfield },
    575 		{ X("Subject:"),			mime_decode_usfield },
    576 		{ X("Comments:"),			mime_decode_usfield },
    577 		{ X("X-"),				mime_decode_usfield },
    578 		{ NULL, 0,				mime_decode_usfield },	/* optional-fields */
    579 #undef X
    580 	};
    581 	const struct field_decoder_tbl_s *fp;
    582 
    583 	/* XXX - this begs for a hash table! */
    584 	for (fp = field_decoder_tbl; fp->field_name; fp++)
    585 		if (strncasecmp(name, fp->field_name, fp->field_len) == 0)
    586 			break;
    587 	return fp->decoder;
    588 }
    589 
    590 #endif /* MIME_SUPPORT */
    591