Home | History | Annotate | Line # | Download | only in libedit
tokenizer.c revision 1.25
      1 /*	$NetBSD: tokenizer.c,v 1.25 2016/04/11 00:22:48 christos Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1992, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  *
      7  * This code is derived from software contributed to Berkeley by
      8  * Christos Zoulas of Cornell University.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  * 3. Neither the name of the University nor the names of its contributors
     19  *    may be used to endorse or promote products derived from this software
     20  *    without specific prior written permission.
     21  *
     22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     32  * SUCH DAMAGE.
     33  */
     34 
     35 #include "config.h"
     36 #if !defined(lint) && !defined(SCCSID)
     37 #if 0
     38 static char sccsid[] = "@(#)tokenizer.c	8.1 (Berkeley) 6/4/93";
     39 #else
     40 __RCSID("$NetBSD: tokenizer.c,v 1.25 2016/04/11 00:22:48 christos Exp $");
     41 #endif
     42 #endif /* not lint && not SCCSID */
     43 
     44 /* We build this file twice, once as NARROW, once as WIDE. */
     45 /*
     46  * tokenizer.c: Bourne shell like tokenizer
     47  */
     48 #include <stdlib.h>
     49 #include <string.h>
     50 
     51 #include "histedit.h"
     52 #include "chartype.h"
     53 
/*
 * Quoting state tracked by the tokenizer while it scans a line.
 */
typedef enum {
	Q_none = 0,	/* Not inside any quote			 */
	Q_single,	/* Inside single quotes			 */
	Q_double,	/* Inside double quotes			 */
	Q_one,		/* Backslash seen outside of quotes	 */
	Q_doubleone	/* Backslash seen inside double quotes	 */
} quote_t;
     57 
/*
 * Character-width dependent helpers.  The file is compiled once with
 * NARROWCHAR defined and once without it; these macros select the
 * matching function names, type names, literals and string routines.
 */
#ifndef NARROWCHAR
#define	FUN(prefix, rest)	prefix ## _w ## rest
#define	TYPE(type)		type ## W
#define	STR(x)			L ## x
#define	Strchr(s, c)		wcschr(s, c)
#define	tok_strdup(s)		wcsdup(s)
#else
#define	FUN(prefix, rest)	prefix ## _ ## rest
#define	TYPE(type)		type
#define	STR(x)			x
#define	Strchr(s, c)		strchr(s, c)
#define	tok_strdup(s)		strdup(s)
#endif

/* Bits kept in the tokenizer's flags member. */
#define	TOK_KEEP	1
#define	TOK_EAT		2

/* Initial size and growth step of the word buffer and of the argv array. */
#define	WINCR		20
#define	AINCR		10

/* Field separators used when tok_init() is given a NULL ifs. */
#define	IFS		STR("\t \n")

/* Allocation hooks. */
#define	tok_malloc(a)		malloc(a)
#define	tok_free(a)		free(a)
#define	tok_realloc(a, b)	realloc(a, b)
     83 
     84 struct TYPE(tokenizer) {
     85 	Char	*ifs;		/* In field separator			 */
     86 	size_t	 argc, amax;	/* Current and maximum number of args	 */
     87 	Char   **argv;		/* Argument list			 */
     88 	Char	*wptr, *wmax;	/* Space and limit on the word buffer	 */
     89 	Char	*wstart;	/* Beginning of next word		 */
     90 	Char	*wspace;	/* Space of word buffer			 */
     91 	quote_t	 quote;		/* Quoting state			 */
     92 	int	 flags;		/* flags;				 */
     93 };
     94 
     95 
     96 private void FUN(tok,finish)(TYPE(Tokenizer) *);
     97 
     98 
     99 /* FUN(tok,finish)():
    100  *	Finish a word in the tokenizer.
    101  */
    102 private void
    103 FUN(tok,finish)(TYPE(Tokenizer) *tok)
    104 {
    105 
    106 	*tok->wptr = '\0';
    107 	if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
    108 		tok->argv[tok->argc++] = tok->wstart;
    109 		tok->argv[tok->argc] = NULL;
    110 		tok->wstart = ++tok->wptr;
    111 	}
    112 	tok->flags &= ~TOK_KEEP;
    113 }
    114 
    115 
    116 /* FUN(tok,init)():
    117  *	Initialize the tokenizer
    118  */
    119 public TYPE(Tokenizer) *
    120 FUN(tok,init)(const Char *ifs)
    121 {
    122 	TYPE(Tokenizer) *tok = tok_malloc(sizeof(*tok));
    123 
    124 	if (tok == NULL)
    125 		return NULL;
    126 	tok->ifs = tok_strdup(ifs ? ifs : IFS);
    127 	if (tok->ifs == NULL) {
    128 		tok_free(tok);
    129 		return NULL;
    130 	}
    131 	tok->argc = 0;
    132 	tok->amax = AINCR;
    133 	tok->argv = tok_malloc(sizeof(*tok->argv) * tok->amax);
    134 	if (tok->argv == NULL) {
    135 		tok_free(tok->ifs);
    136 		tok_free(tok);
    137 		return NULL;
    138 	}
    139 	tok->argv[0] = NULL;
    140 	tok->wspace = tok_malloc(WINCR * sizeof(*tok->wspace));
    141 	if (tok->wspace == NULL) {
    142 		tok_free(tok->argv);
    143 		tok_free(tok->ifs);
    144 		tok_free(tok);
    145 		return NULL;
    146 	}
    147 	tok->wmax = tok->wspace + WINCR;
    148 	tok->wstart = tok->wspace;
    149 	tok->wptr = tok->wspace;
    150 	tok->flags = 0;
    151 	tok->quote = Q_none;
    152 
    153 	return tok;
    154 }
    155 
    156 
    157 /* FUN(tok,reset)():
    158  *	Reset the tokenizer
    159  */
    160 public void
    161 FUN(tok,reset)(TYPE(Tokenizer) *tok)
    162 {
    163 
    164 	tok->argc = 0;
    165 	tok->wstart = tok->wspace;
    166 	tok->wptr = tok->wspace;
    167 	tok->flags = 0;
    168 	tok->quote = Q_none;
    169 }
    170 
    171 
    172 /* FUN(tok,end)():
    173  *	Clean up
    174  */
    175 public void
    176 FUN(tok,end)(TYPE(Tokenizer) *tok)
    177 {
    178 
    179 	tok_free(tok->ifs);
    180 	tok_free(tok->wspace);
    181 	tok_free(tok->argv);
    182 	tok_free(tok);
    183 }
    184 
    185 
    186 
    187 /* FUN(tok,line)():
    188  *	Bourne shell (sh(1)) like tokenizing
    189  *	Arguments:
    190  *		tok	current tokenizer state (setup with FUN(tok,init)())
    191  *		line	line to parse
    192  *	Returns:
    193  *		-1	Internal error
    194  *		 3	Quoted return
    195  *		 2	Unmatched double quote
    196  *		 1	Unmatched single quote
    197  *		 0	Ok
    198  *	Modifies (if return value is 0):
    199  *		argc	number of arguments
    200  *		argv	argument array
    201  *		cursorc	if !NULL, argv element containing cursor
    202  *		cursorv	if !NULL, offset in argv[cursorc] of cursor
    203  */
    204 public int
    205 FUN(tok,line)(TYPE(Tokenizer) *tok, const TYPE(LineInfo) *line,
    206     int *argc, const Char ***argv, int *cursorc, int *cursoro)
    207 {
    208 	const Char *ptr;
    209 	int cc, co;
    210 
    211 	cc = co = -1;
    212 	ptr = line->buffer;
    213 	for (ptr = line->buffer; ;ptr++) {
    214 		if (ptr >= line->lastchar)
    215 			ptr = STR("");
    216 		if (ptr == line->cursor) {
    217 			cc = (int)tok->argc;
    218 			co = (int)(tok->wptr - tok->wstart);
    219 		}
    220 		switch (*ptr) {
    221 		case '\'':
    222 			tok->flags |= TOK_KEEP;
    223 			tok->flags &= ~TOK_EAT;
    224 			switch (tok->quote) {
    225 			case Q_none:
    226 				tok->quote = Q_single;	/* Enter single quote
    227 							 * mode */
    228 				break;
    229 
    230 			case Q_single:	/* Exit single quote mode */
    231 				tok->quote = Q_none;
    232 				break;
    233 
    234 			case Q_one:	/* Quote this ' */
    235 				tok->quote = Q_none;
    236 				*tok->wptr++ = *ptr;
    237 				break;
    238 
    239 			case Q_double:	/* Stay in double quote mode */
    240 				*tok->wptr++ = *ptr;
    241 				break;
    242 
    243 			case Q_doubleone:	/* Quote this ' */
    244 				tok->quote = Q_double;
    245 				*tok->wptr++ = *ptr;
    246 				break;
    247 
    248 			default:
    249 				return -1;
    250 			}
    251 			break;
    252 
    253 		case '"':
    254 			tok->flags &= ~TOK_EAT;
    255 			tok->flags |= TOK_KEEP;
    256 			switch (tok->quote) {
    257 			case Q_none:	/* Enter double quote mode */
    258 				tok->quote = Q_double;
    259 				break;
    260 
    261 			case Q_double:	/* Exit double quote mode */
    262 				tok->quote = Q_none;
    263 				break;
    264 
    265 			case Q_one:	/* Quote this " */
    266 				tok->quote = Q_none;
    267 				*tok->wptr++ = *ptr;
    268 				break;
    269 
    270 			case Q_single:	/* Stay in single quote mode */
    271 				*tok->wptr++ = *ptr;
    272 				break;
    273 
    274 			case Q_doubleone:	/* Quote this " */
    275 				tok->quote = Q_double;
    276 				*tok->wptr++ = *ptr;
    277 				break;
    278 
    279 			default:
    280 				return -1;
    281 			}
    282 			break;
    283 
    284 		case '\\':
    285 			tok->flags |= TOK_KEEP;
    286 			tok->flags &= ~TOK_EAT;
    287 			switch (tok->quote) {
    288 			case Q_none:	/* Quote next character */
    289 				tok->quote = Q_one;
    290 				break;
    291 
    292 			case Q_double:	/* Quote next character */
    293 				tok->quote = Q_doubleone;
    294 				break;
    295 
    296 			case Q_one:	/* Quote this, restore state */
    297 				*tok->wptr++ = *ptr;
    298 				tok->quote = Q_none;
    299 				break;
    300 
    301 			case Q_single:	/* Stay in single quote mode */
    302 				*tok->wptr++ = *ptr;
    303 				break;
    304 
    305 			case Q_doubleone:	/* Quote this \ */
    306 				tok->quote = Q_double;
    307 				*tok->wptr++ = *ptr;
    308 				break;
    309 
    310 			default:
    311 				return -1;
    312 			}
    313 			break;
    314 
    315 		case '\n':
    316 			tok->flags &= ~TOK_EAT;
    317 			switch (tok->quote) {
    318 			case Q_none:
    319 				goto tok_line_outok;
    320 
    321 			case Q_single:
    322 			case Q_double:
    323 				*tok->wptr++ = *ptr;	/* Add the return */
    324 				break;
    325 
    326 			case Q_doubleone:   /* Back to double, eat the '\n' */
    327 				tok->flags |= TOK_EAT;
    328 				tok->quote = Q_double;
    329 				break;
    330 
    331 			case Q_one:	/* No quote, more eat the '\n' */
    332 				tok->flags |= TOK_EAT;
    333 				tok->quote = Q_none;
    334 				break;
    335 
    336 			default:
    337 				return 0;
    338 			}
    339 			break;
    340 
    341 		case '\0':
    342 			switch (tok->quote) {
    343 			case Q_none:
    344 				/* Finish word and return */
    345 				if (tok->flags & TOK_EAT) {
    346 					tok->flags &= ~TOK_EAT;
    347 					return 3;
    348 				}
    349 				goto tok_line_outok;
    350 
    351 			case Q_single:
    352 				return 1;
    353 
    354 			case Q_double:
    355 				return 2;
    356 
    357 			case Q_doubleone:
    358 				tok->quote = Q_double;
    359 				*tok->wptr++ = *ptr;
    360 				break;
    361 
    362 			case Q_one:
    363 				tok->quote = Q_none;
    364 				*tok->wptr++ = *ptr;
    365 				break;
    366 
    367 			default:
    368 				return -1;
    369 			}
    370 			break;
    371 
    372 		default:
    373 			tok->flags &= ~TOK_EAT;
    374 			switch (tok->quote) {
    375 			case Q_none:
    376 				if (Strchr(tok->ifs, *ptr) != NULL)
    377 					FUN(tok,finish)(tok);
    378 				else
    379 					*tok->wptr++ = *ptr;
    380 				break;
    381 
    382 			case Q_single:
    383 			case Q_double:
    384 				*tok->wptr++ = *ptr;
    385 				break;
    386 
    387 
    388 			case Q_doubleone:
    389 				*tok->wptr++ = '\\';
    390 				tok->quote = Q_double;
    391 				*tok->wptr++ = *ptr;
    392 				break;
    393 
    394 			case Q_one:
    395 				tok->quote = Q_none;
    396 				*tok->wptr++ = *ptr;
    397 				break;
    398 
    399 			default:
    400 				return -1;
    401 
    402 			}
    403 			break;
    404 		}
    405 
    406 		if (tok->wptr >= tok->wmax - 4) {
    407 			size_t size = (size_t)(tok->wmax - tok->wspace + WINCR);
    408 			Char *s = tok_realloc(tok->wspace,
    409 			    size * sizeof(*s));
    410 			if (s == NULL)
    411 				return -1;
    412 
    413 			if (s != tok->wspace) {
    414 				size_t i;
    415 				for (i = 0; i < tok->argc; i++) {
    416 				    tok->argv[i] =
    417 					(tok->argv[i] - tok->wspace) + s;
    418 				}
    419 				tok->wptr = (tok->wptr - tok->wspace) + s;
    420 				tok->wstart = (tok->wstart - tok->wspace) + s;
    421 				tok->wspace = s;
    422 			}
    423 			tok->wmax = s + size;
    424 		}
    425 		if (tok->argc >= tok->amax - 4) {
    426 			Char **p;
    427 			tok->amax += AINCR;
    428 			p = tok_realloc(tok->argv, tok->amax * sizeof(*p));
    429 			if (p == NULL) {
    430 				tok->amax -= AINCR;
    431 				return -1;
    432 			}
    433 			tok->argv = p;
    434 		}
    435 	}
    436  tok_line_outok:
    437 	if (cc == -1 && co == -1) {
    438 		cc = (int)tok->argc;
    439 		co = (int)(tok->wptr - tok->wstart);
    440 	}
    441 	if (cursorc != NULL)
    442 		*cursorc = cc;
    443 	if (cursoro != NULL)
    444 		*cursoro = co;
    445 	FUN(tok,finish)(tok);
    446 	*argv = (const Char **)tok->argv;
    447 	*argc = (int)tok->argc;
    448 	return 0;
    449 }
    450 
    451 /* FUN(tok,str)():
    452  *	Simpler version of tok_line, taking a NUL terminated line
    453  *	and splitting into words, ignoring cursor state.
    454  */
    455 public int
    456 FUN(tok,str)(TYPE(Tokenizer) *tok, const Char *line, int *argc,
    457     const Char ***argv)
    458 {
    459 	TYPE(LineInfo) li;
    460 
    461 	memset(&li, 0, sizeof(li));
    462 	li.buffer = line;
    463 	li.cursor = li.lastchar = Strchr(line, '\0');
    464 	return FUN(tok,line)(tok, &li, argc, argv, NULL, NULL);
    465 }
    466