Home | History | Annotate | Line # | Download | only in libedit
      1 /*	$NetBSD: tokenizer.c,v 1.29 2023/05/30 11:53:40 christos Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1992, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  *
      7  * This code is derived from software contributed to Berkeley by
      8  * Christos Zoulas of Cornell University.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  * 3. Neither the name of the University nor the names of its contributors
     19  *    may be used to endorse or promote products derived from this software
     20  *    without specific prior written permission.
     21  *
     22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     32  * SUCH DAMAGE.
     33  */
     34 
     35 #include "config.h"
     36 #if !defined(lint) && !defined(SCCSID)
     37 #if 0
     38 static char sccsid[] = "@(#)tokenizer.c	8.1 (Berkeley) 6/4/93";
     39 #else
     40 __RCSID("$NetBSD: tokenizer.c,v 1.29 2023/05/30 11:53:40 christos Exp $");
     41 #endif
     42 #endif /* not lint && not SCCSID */
     43 
     44 /* We build this file twice, once as NARROW, once as WIDE. */
/*
 * tokenizer.c: Bourne shell like tokenizer
 */
     48 #include <stdlib.h>
     49 #include <string.h>
     50 
     51 #include "histedit.h"
     52 
/*
 * Quoting state carried by the tokenizer from one character to the next.
 */
typedef enum {
	Q_none,		/* not inside any quoting construct */
	Q_single,	/* between single quotes */
	Q_double,	/* between double quotes */
	Q_one,		/* backslash seen outside quotes: next char is quoted */
	Q_doubleone	/* backslash seen inside double quotes */
} quote_t;
     56 
/*
 * Bits kept in the tokenizer's flags field.
 * TOK_KEEP: the current word must be emitted even if it is empty (set
 *	     whenever a quote character or backslash is seen).
 * TOK_EAT:  a backslash-newline pair was just consumed.
 */
#define	TOK_KEEP	1
#define	TOK_EAT		2

/*
 * Initial size and growth step of the word buffer (WINCR, in Chars) and
 * of the argument vector (AINCR, in entries).
 */
#define	WINCR		20
#define	AINCR		10

/* Default field separators, used when the init function is given NULL. */
#define	IFS		STR("\t \n")

/* Allocation wrappers; all tokenizer memory goes through these. */
#define	tok_malloc(a)		malloc(a)
#define	tok_free(a)		free(a)
#define	tok_realloc(a, b)	realloc(a, b)
     68 
/*
 * Character-width selection.  With NARROWCHAR defined the file is built on
 * char and the str* functions, and FUN(tok, x) expands to tok_x; otherwise
 * it is built on wchar_t and the wcs* functions, FUN(tok, x) expands to
 * tok_wx, TYPE(T) expands to TW and string literals get an L prefix.
 */
#ifdef NARROWCHAR
#define	Char			char
#define	FUN(prefix, rest)	prefix ## _ ## rest
#define	TYPE(type)		type
#define	STR(x)			x
#define	Strchr(s, c)		strchr(s, c)
#define	tok_strdup(s)		strdup(s)
#else
#define	Char			wchar_t
#define	FUN(prefix, rest)	prefix ## _w ## rest
#define	TYPE(type)		type ## W
#define	STR(x)			L ## x
#define	Strchr(s, c)		wcschr(s, c)
#define	tok_strdup(s)		wcsdup(s)
#endif
     84 
/*
 * Tokenizer state.  All words live back to back, each NUL terminated, in
 * a single growable buffer (wspace); argv holds pointers into that buffer.
 */
struct TYPE(tokenizer) {
	Char	*ifs;		/* Field separator characters (owned copy) */
	size_t	 argc, amax;	/* Words stored so far / allocated argv slots */
	const Char   **argv;	/* NULL-terminated list of pointers to words */
	Char	*wptr, *wmax;	/* Next write position / end of word buffer */
	Char	*wstart;	/* Start of the word being accumulated	 */
	Char	*wspace;	/* Base address of the word buffer	 */
	quote_t	 quote;		/* Current quoting state		 */
	int	 flags;		/* TOK_KEEP / TOK_EAT bits		 */
};
     95 
     96 
     97 static void FUN(tok,finish)(TYPE(Tokenizer) *);
     98 
     99 
    100 /* FUN(tok,finish)():
    101  *	Finish a word in the tokenizer.
    102  */
    103 static void
    104 FUN(tok,finish)(TYPE(Tokenizer) *tok)
    105 {
    106 
    107 	*tok->wptr = '\0';
    108 	if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
    109 		tok->argv[tok->argc++] = tok->wstart;
    110 		tok->argv[tok->argc] = NULL;
    111 		tok->wstart = ++tok->wptr;
    112 	}
    113 	tok->flags &= ~TOK_KEEP;
    114 }
    115 
    116 
    117 /* FUN(tok,init)():
    118  *	Initialize the tokenizer
    119  */
    120 TYPE(Tokenizer) *
    121 FUN(tok,init)(const Char *ifs)
    122 {
    123 	TYPE(Tokenizer) *tok = tok_malloc(sizeof(*tok));
    124 
    125 	if (tok == NULL)
    126 		return NULL;
    127 	tok->ifs = tok_strdup(ifs ? ifs : IFS);
    128 	if (tok->ifs == NULL) {
    129 		tok_free(tok);
    130 		return NULL;
    131 	}
    132 	tok->argc = 0;
    133 	tok->amax = AINCR;
    134 	tok->argv = tok_malloc(sizeof(*tok->argv) * tok->amax);
    135 	if (tok->argv == NULL) {
    136 		tok_free(tok->ifs);
    137 		tok_free(tok);
    138 		return NULL;
    139 	}
    140 	tok->argv[0] = NULL;
    141 	tok->wspace = tok_malloc(WINCR * sizeof(*tok->wspace));
    142 	if (tok->wspace == NULL) {
    143 		tok_free(tok->argv);
    144 		tok_free(tok->ifs);
    145 		tok_free(tok);
    146 		return NULL;
    147 	}
    148 	tok->wmax = tok->wspace + WINCR;
    149 	tok->wstart = tok->wspace;
    150 	tok->wptr = tok->wspace;
    151 	tok->flags = 0;
    152 	tok->quote = Q_none;
    153 
    154 	return tok;
    155 }
    156 
    157 
    158 /* FUN(tok,reset)():
    159  *	Reset the tokenizer
    160  */
    161 void
    162 FUN(tok,reset)(TYPE(Tokenizer) *tok)
    163 {
    164 
    165 	tok->argc = 0;
    166 	tok->wstart = tok->wspace;
    167 	tok->wptr = tok->wspace;
    168 	tok->flags = 0;
    169 	tok->quote = Q_none;
    170 }
    171 
    172 
    173 /* FUN(tok,end)():
    174  *	Clean up
    175  */
    176 void
    177 FUN(tok,end)(TYPE(Tokenizer) *tok)
    178 {
    179 
    180 	tok_free(tok->ifs);
    181 	tok_free(tok->wspace);
    182 	tok_free(tok->argv);
    183 	tok_free(tok);
    184 }
    185 
    186 
    187 
    188 /* FUN(tok,line)():
    189  *	Bourne shell (sh(1)) like tokenizing
    190  *	Arguments:
    191  *		tok	current tokenizer state (setup with FUN(tok,init)())
    192  *		line	line to parse
    193  *	Returns:
    194  *		-1	Internal error
    195  *		 3	Quoted return
    196  *		 2	Unmatched double quote
    197  *		 1	Unmatched single quote
    198  *		 0	Ok
    199  *	Modifies (if return value is 0):
    200  *		argc	number of arguments
    201  *		argv	argument array
    202  *		cursorc	if !NULL, argv element containing cursor
    203  *		cursorv	if !NULL, offset in argv[cursorc] of cursor
    204  */
    205 int
    206 FUN(tok,line)(TYPE(Tokenizer) *tok, const TYPE(LineInfo) *line,
    207     int *argc, const Char ***argv, int *cursorc, int *cursoro)
    208 {
    209 	const Char *ptr;
    210 	int cc, co;
    211 
    212 	cc = co = -1;
    213 	ptr = line->buffer;
    214 	for (ptr = line->buffer; ;ptr++) {
    215 		if (ptr >= line->lastchar)
    216 			ptr = STR("");
    217 		if (ptr == line->cursor) {
    218 			cc = (int)tok->argc;
    219 			co = (int)(tok->wptr - tok->wstart);
    220 		}
    221 		switch (*ptr) {
    222 		case '\'':
    223 			tok->flags |= TOK_KEEP;
    224 			tok->flags &= ~TOK_EAT;
    225 			switch (tok->quote) {
    226 			case Q_none:
    227 				tok->quote = Q_single;	/* Enter single quote
    228 							 * mode */
    229 				break;
    230 
    231 			case Q_single:	/* Exit single quote mode */
    232 				tok->quote = Q_none;
    233 				break;
    234 
    235 			case Q_one:	/* Quote this ' */
    236 				tok->quote = Q_none;
    237 				*tok->wptr++ = *ptr;
    238 				break;
    239 
    240 			case Q_double:	/* Stay in double quote mode */
    241 				*tok->wptr++ = *ptr;
    242 				break;
    243 
    244 			case Q_doubleone:	/* Quote this ' */
    245 				tok->quote = Q_double;
    246 				*tok->wptr++ = *ptr;
    247 				break;
    248 
    249 			default:
    250 				return -1;
    251 			}
    252 			break;
    253 
    254 		case '"':
    255 			tok->flags &= ~TOK_EAT;
    256 			tok->flags |= TOK_KEEP;
    257 			switch (tok->quote) {
    258 			case Q_none:	/* Enter double quote mode */
    259 				tok->quote = Q_double;
    260 				break;
    261 
    262 			case Q_double:	/* Exit double quote mode */
    263 				tok->quote = Q_none;
    264 				break;
    265 
    266 			case Q_one:	/* Quote this " */
    267 				tok->quote = Q_none;
    268 				*tok->wptr++ = *ptr;
    269 				break;
    270 
    271 			case Q_single:	/* Stay in single quote mode */
    272 				*tok->wptr++ = *ptr;
    273 				break;
    274 
    275 			case Q_doubleone:	/* Quote this " */
    276 				tok->quote = Q_double;
    277 				*tok->wptr++ = *ptr;
    278 				break;
    279 
    280 			default:
    281 				return -1;
    282 			}
    283 			break;
    284 
    285 		case '\\':
    286 			tok->flags |= TOK_KEEP;
    287 			tok->flags &= ~TOK_EAT;
    288 			switch (tok->quote) {
    289 			case Q_none:	/* Quote next character */
    290 				tok->quote = Q_one;
    291 				break;
    292 
    293 			case Q_double:	/* Quote next character */
    294 				tok->quote = Q_doubleone;
    295 				break;
    296 
    297 			case Q_one:	/* Quote this, restore state */
    298 				*tok->wptr++ = *ptr;
    299 				tok->quote = Q_none;
    300 				break;
    301 
    302 			case Q_single:	/* Stay in single quote mode */
    303 				*tok->wptr++ = *ptr;
    304 				break;
    305 
    306 			case Q_doubleone:	/* Quote this \ */
    307 				tok->quote = Q_double;
    308 				*tok->wptr++ = *ptr;
    309 				break;
    310 
    311 			default:
    312 				return -1;
    313 			}
    314 			break;
    315 
    316 		case '\n':
    317 			tok->flags &= ~TOK_EAT;
    318 			switch (tok->quote) {
    319 			case Q_none:
    320 				goto tok_line_outok;
    321 
    322 			case Q_single:
    323 			case Q_double:
    324 				*tok->wptr++ = *ptr;	/* Add the return */
    325 				break;
    326 
    327 			case Q_doubleone:   /* Back to double, eat the '\n' */
    328 				tok->flags |= TOK_EAT;
    329 				tok->quote = Q_double;
    330 				break;
    331 
    332 			case Q_one:	/* No quote, more eat the '\n' */
    333 				tok->flags |= TOK_EAT;
    334 				tok->quote = Q_none;
    335 				break;
    336 
    337 			default:
    338 				return 0;
    339 			}
    340 			break;
    341 
    342 		case '\0':
    343 			switch (tok->quote) {
    344 			case Q_none:
    345 				/* Finish word and return */
    346 				if (tok->flags & TOK_EAT) {
    347 					tok->flags &= ~TOK_EAT;
    348 					return 3;
    349 				}
    350 				goto tok_line_outok;
    351 
    352 			case Q_single:
    353 				return 1;
    354 
    355 			case Q_double:
    356 				return 2;
    357 
    358 			case Q_doubleone:
    359 				tok->quote = Q_double;
    360 				*tok->wptr++ = *ptr;
    361 				break;
    362 
    363 			case Q_one:
    364 				tok->quote = Q_none;
    365 				*tok->wptr++ = *ptr;
    366 				break;
    367 
    368 			default:
    369 				return -1;
    370 			}
    371 			break;
    372 
    373 		default:
    374 			tok->flags &= ~TOK_EAT;
    375 			switch (tok->quote) {
    376 			case Q_none:
    377 				if (Strchr(tok->ifs, *ptr) != NULL)
    378 					FUN(tok,finish)(tok);
    379 				else
    380 					*tok->wptr++ = *ptr;
    381 				break;
    382 
    383 			case Q_single:
    384 			case Q_double:
    385 				*tok->wptr++ = *ptr;
    386 				break;
    387 
    388 
    389 			case Q_doubleone:
    390 				*tok->wptr++ = '\\';
    391 				tok->quote = Q_double;
    392 				*tok->wptr++ = *ptr;
    393 				break;
    394 
    395 			case Q_one:
    396 				tok->quote = Q_none;
    397 				*tok->wptr++ = *ptr;
    398 				break;
    399 
    400 			default:
    401 				return -1;
    402 
    403 			}
    404 			break;
    405 		}
    406 
    407 		if (tok->wptr >= tok->wmax - 4) {
    408 			size_t size = (size_t)(tok->wmax - tok->wspace + WINCR);
    409 			Char *s = tok_realloc(tok->wspace,
    410 			    size * sizeof(*s));
    411 			if (s == NULL)
    412 				return -1;
    413 
    414 			if (s != tok->wspace) {
    415 				size_t i;
    416 				for (i = 0; i < tok->argc; i++) {
    417 				    tok->argv[i] =
    418 					(tok->argv[i] - tok->wspace) + s;
    419 				}
    420 				tok->wptr = (tok->wptr - tok->wspace) + s;
    421 				tok->wstart = (tok->wstart - tok->wspace) + s;
    422 				tok->wspace = s;
    423 			}
    424 			tok->wmax = s + size;
    425 		}
    426 		if (tok->argc >= tok->amax - 4) {
    427 			const Char **p;
    428 			tok->amax += AINCR;
    429 			p = tok_realloc(tok->argv, tok->amax * sizeof(*p));
    430 			if (p == NULL) {
    431 				tok->amax -= AINCR;
    432 				return -1;
    433 			}
    434 			tok->argv = p;
    435 		}
    436 	}
    437  tok_line_outok:
    438 	if (cc == -1 && co == -1) {
    439 		cc = (int)tok->argc;
    440 		co = (int)(tok->wptr - tok->wstart);
    441 	}
    442 	if (cursorc != NULL)
    443 		*cursorc = cc;
    444 	if (cursoro != NULL)
    445 		*cursoro = co;
    446 	FUN(tok,finish)(tok);
    447 	*argv = tok->argv;
    448 	*argc = (int)tok->argc;
    449 	return 0;
    450 }
    451 
    452 /* FUN(tok,str)():
    453  *	Simpler version of tok_line, taking a NUL terminated line
    454  *	and splitting into words, ignoring cursor state.
    455  */
    456 int
    457 FUN(tok,str)(TYPE(Tokenizer) *tok, const Char *line, int *argc,
    458     const Char ***argv)
    459 {
    460 	TYPE(LineInfo) li;
    461 
    462 	memset(&li, 0, sizeof(li));
    463 	li.buffer = line;
    464 	li.cursor = li.lastchar = Strchr(line, '\0');
    465 	return FUN(tok,line)(tok, &li, argc, argv, NULL, NULL);
    466 }
    467