Home | History | Annotate | Line # | Download | only in libedit
tokenizer.c revision 1.2
      1 /*	$NetBSD: tokenizer.c,v 1.2 1997/01/11 06:48:15 lukem Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1992, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  *
      7  * This code is derived from software contributed to Berkeley by
      8  * Christos Zoulas of Cornell University.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  * 3. All advertising materials mentioning features or use of this software
     19  *    must display the following acknowledgement:
     20  *	This product includes software developed by the University of
     21  *	California, Berkeley and its contributors.
     22  * 4. Neither the name of the University nor the names of its contributors
     23  *    may be used to endorse or promote products derived from this software
     24  *    without specific prior written permission.
     25  *
     26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     36  * SUCH DAMAGE.
     37  */
     38 
/*
 * Source revision identifier.  The whole block is skipped when either
 * `lint' or `SCCSID' is defined.  The Berkeley SCCS string is kept for
 * reference but is disabled by the `#if 0'; only the NetBSD RCS string
 * is actually compiled.
 */
#if !defined(lint) && !defined(SCCSID)
#if 0
static char sccsid[] = "@(#)tokenizer.c	8.1 (Berkeley) 6/4/93";
#else
static char rcsid[] = "$NetBSD: tokenizer.c,v 1.2 1997/01/11 06:48:15 lukem Exp $";
#endif
#endif /* not lint && not SCCSID */
     46 
/*
 * tokenizer.c: Bourne shell-like tokenizer
 */
     50 #include "sys.h"
     51 #include <string.h>
     52 #include <stdlib.h>
     53 #include "tokenizer.h"
     54 
/*
 * Quoting state of the tokenizer:
 *	Q_none		not inside any quote
 *	Q_single	inside '...'
 *	Q_double	inside "..."
 *	Q_one		a backslash was seen outside quotes; it applies to
 *			the next character only
 *	Q_doubleone	a backslash was seen inside "..."; it applies to
 *			the next character only, then Q_double resumes
 */
typedef enum { Q_none, Q_single, Q_double, Q_one, Q_doubleone } quote_t;

/* Default word separators, used when tok_init() is given a NULL ifs */
#define IFS "\t \n"

/*
 * Bits for the tokenizer `flags' field:
 *	TOK_KEEP	set when a quote character or backslash is seen;
 *			makes tok_finish() emit the current word even if
 *			it is empty
 *	TOK_EAT		set when a backslash-escaped newline has just been
 *			swallowed; cleared by any other character
 */
#define TOK_KEEP	1
#define TOK_EAT		2

/* Initial size and growth increment of the word buffer, in bytes */
#define WINCR 20
/* Initial size and growth increment of the argument vector, in entries */
#define AINCR 10

/* Allocator hooks; here they map directly onto the C library */
#define tok_malloc(a)		malloc(a)
#define tok_free(a)		free(a)
#define tok_realloc(a, b)	realloc(a, b)
     68 
     69 
/*
 * Tokenizer state.  All words are stored back to back, each terminated by
 * a '\0', in the single buffer `wspace'; the entries of `argv' point into
 * that buffer.
 */
struct tokenizer {
    char   *ifs;		/* Input field separator characters	*/
    int     argc, amax;		/* Current and maximum number of args	*/
    char  **argv;		/* Argument list			*/
    char   *wptr, *wmax;	/* Next free byte and end of word buffer */
    char   *wstart;		/* Beginning of next word		*/
    char   *wspace;		/* Start of the word buffer		*/
    quote_t quote;		/* Quoting state			*/
    int	    flags;		/* TOK_KEEP / TOK_EAT bits		*/
};
     80 
     81 
     82 private void tok_finish	__P((Tokenizer *));
     83 
     84 
     85 /* tok_finish():
     86  *	Finish a word in the tokenizer.
     87  */
     88 private void
     89 tok_finish(tok)
     90     Tokenizer *tok;
     91 {
     92     *tok->wptr = '\0';
     93     if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
     94 	tok->argv[tok->argc++] = tok->wstart;
     95 	tok->argv[tok->argc] = NULL;
     96 	tok->wstart = ++tok->wptr;
     97     }
     98     tok->flags &= ~TOK_KEEP;
     99 }
    100 
    101 
    102 /* tok_init():
    103  *	Initialize the tokenizer
    104  */
    105 public Tokenizer *
    106 tok_init(ifs)
    107     const char *ifs;
    108 {
    109     Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer));
    110 
    111     tok->ifs     = strdup(ifs ? ifs : IFS);
    112     tok->argc    = 0;
    113     tok->amax    = AINCR;
    114     tok->argv    = (char **) tok_malloc(sizeof(char *) * tok->amax);
    115     tok->argv[0] = NULL;
    116     tok->wspace  = (char *) tok_malloc(WINCR);
    117     tok->wmax    = tok->wspace + WINCR;
    118     tok->wstart  = tok->wspace;
    119     tok->wptr    = tok->wspace;
    120     tok->flags   = 0;
    121     tok->quote   = Q_none;
    122 
    123     return tok;
    124 }
    125 
    126 
    127 /* tok_reset():
    128  *	Reset the tokenizer
    129  */
    130 public void
    131 tok_reset(tok)
    132     Tokenizer *tok;
    133 {
    134     tok->argc  = 0;
    135     tok->wstart = tok->wspace;
    136     tok->wptr = tok->wspace;
    137     tok->flags = 0;
    138     tok->quote = Q_none;
    139 }
    140 
    141 
    142 /* tok_end():
    143  *	Clean up
    144  */
    145 public void
    146 tok_end(tok)
    147     Tokenizer *tok;
    148 {
    149     tok_free((ptr_t) tok->ifs);
    150     tok_free((ptr_t) tok->wspace);
    151     tok_free((ptr_t) tok->argv);
    152     tok_free((ptr_t) tok);
    153 }
    154 
    155 
    156 
    157 /* tok_line():
    158  *	Bourne shell like tokenizing
    159  *	Return:
    160  *		-1: Internal error
    161  *		 3: Quoted return
    162  *		 2: Unmatched double quote
    163  *		 1: Unmatched single quote
    164  *		 0: Ok
    165  */
    166 public int
    167 tok_line(tok, line, argc, argv)
    168     Tokenizer *tok;
    169     const char* line;
    170     int *argc;
    171     char ***argv;
    172 {
    173     const char *ptr;
    174 
    175     while (1) {
    176 	switch (*(ptr = line++)) {
    177 	case '\'':
    178 	    tok->flags |= TOK_KEEP;
    179 	    tok->flags &= ~TOK_EAT;
    180 	    switch (tok->quote) {
    181 	    case Q_none:
    182 		tok->quote = Q_single;	/* Enter single quote mode */
    183 		break;
    184 
    185 	    case Q_single:		/* Exit single quote mode */
    186 		tok->quote = Q_none;
    187 		break;
    188 
    189 	    case Q_one:			/* Quote this ' */
    190 		tok->quote = Q_none;
    191 		*tok->wptr++ = *ptr;
    192 		break;
    193 
    194 	    case Q_double:		/* Stay in double quote mode */
    195 		*tok->wptr++ = *ptr;
    196 		break;
    197 
    198 	    case Q_doubleone:		/* Quote this ' */
    199 		tok->quote = Q_double;
    200 		*tok->wptr++ = *ptr;
    201 		break;
    202 
    203 	    default:
    204 		return(-1);
    205 	    }
    206 	    break;
    207 
    208 	case '"':
    209 	    tok->flags &= ~TOK_EAT;
    210 	    tok->flags |= TOK_KEEP;
    211 	    switch (tok->quote) {
    212 	    case Q_none:		/* Enter double quote mode */
    213 		tok->quote = Q_double;
    214 		break;
    215 
    216 	    case Q_double:
    217 		tok->quote = Q_none;	/* Exit double quote mode */
    218 		break;
    219 
    220 	    case Q_one:			/* Quote this " */
    221 		tok->quote = Q_none;
    222 		*tok->wptr++ = *ptr;
    223 		break;
    224 
    225 	    case Q_single:		/* Stay in single quote mode */
    226 		*tok->wptr++ = *ptr;
    227 		break;
    228 
    229 	    case Q_doubleone:		/* Quote this " */
    230 		tok->quote = Q_double;
    231 		*tok->wptr++ = *ptr;
    232 		break;
    233 
    234 	    default:
    235 		return(-1);
    236 	    }
    237 	    break;
    238 
    239 	case '\\':
    240 	    tok->flags |= TOK_KEEP;
    241 	    tok->flags &= ~TOK_EAT;
    242 	    switch (tok->quote) {
    243 	    case Q_none:		/* Quote next character */
    244 		tok->quote = Q_one;
    245 		break;
    246 
    247 	    case Q_double:
    248 		tok->quote = Q_doubleone;/* Quote next character */
    249 		break;
    250 
    251 	    case Q_one:
    252 		*tok->wptr++ = *ptr;
    253 		tok->quote = Q_none;	/* Quote this, restore state */
    254 		break;
    255 
    256 	    case Q_single:		/* Stay in single quote mode */
    257 		*tok->wptr++ = *ptr;
    258 		break;
    259 
    260 	    case Q_doubleone:		/* Quote this \ */
    261 		tok->quote = Q_double;
    262 		*tok->wptr++ = *ptr;
    263 		break;
    264 
    265 	    default:
    266 		return(-1);
    267 	    }
    268 	    break;
    269 
    270 	case '\n':
    271 	    tok->flags &= ~TOK_EAT;
    272 	    switch (tok->quote) {
    273 	    case Q_none:
    274 		tok_finish(tok);
    275 		*argv = tok->argv;
    276 		*argc = tok->argc;
    277 		return(0);
    278 
    279 	    case Q_single:
    280 	    case Q_double:
    281 		*tok->wptr++ = *ptr;	/* Add the return		*/
    282 		break;
    283 
    284 	    case Q_doubleone:
    285 		tok->flags |= TOK_EAT;
    286 		tok->quote = Q_double;	/* Back to double, eat the '\n' */
    287 		break;
    288 
    289 	    case Q_one:
    290 		tok->flags |= TOK_EAT;
    291 		tok->quote = Q_none;	/* No quote, more eat the '\n' */
    292 		break;
    293 
    294 	    default:
    295 		return(0);
    296 	    }
    297 	    break;
    298 
    299 	case '\0':
    300 	    switch (tok->quote) {
    301 	    case Q_none:
    302 		/* Finish word and return */
    303 		if (tok->flags & TOK_EAT) {
    304 		    tok->flags &= ~TOK_EAT;
    305 		    return 3;
    306 		}
    307 		tok_finish(tok);
    308 		*argv = tok->argv;
    309 		*argc = tok->argc;
    310 		return(0);
    311 
    312 	    case Q_single:
    313 		return(1);
    314 
    315 	    case Q_double:
    316 		return(2);
    317 
    318 	    case Q_doubleone:
    319 		tok->quote = Q_double;
    320 		*tok->wptr++ = *ptr;
    321 		break;
    322 
    323 	    case Q_one:
    324 		tok->quote = Q_none;
    325 		*tok->wptr++ = *ptr;
    326 		break;
    327 
    328 	    default:
    329 		return(-1);
    330 	    }
    331 	    break;
    332 
    333 	default:
    334 	    tok->flags &= ~TOK_EAT;
    335 	    switch (tok->quote) {
    336 	    case Q_none:
    337 		if (strchr(tok->ifs, *ptr) != NULL)
    338 		    tok_finish(tok);
    339 		else
    340 		    *tok->wptr++ = *ptr;
    341 		break;
    342 
    343 	    case Q_single:
    344 	    case Q_double:
    345 		*tok->wptr++ = *ptr;
    346 		break;
    347 
    348 
    349 	    case Q_doubleone:
    350 		*tok->wptr++ = '\\';
    351 		tok->quote = Q_double;
    352 		*tok->wptr++ = *ptr;
    353 		break;
    354 
    355 	    case Q_one:
    356 		tok->quote = Q_none;
    357 		*tok->wptr++ = *ptr;
    358 		break;
    359 
    360 	    default:
    361 		return(-1);
    362 
    363 	    }
    364 	    break;
    365 	}
    366 
    367 	if (tok->wptr >= tok->wmax - 4) {
    368 	    size_t size = tok->wmax - tok->wspace + WINCR;
    369 	    char *s = (char *) tok_realloc(tok->wspace, size);
    370 	    /*SUPPRESS 22*/
    371 	    int offs = s - tok->wspace;
    372 
    373 	    if (offs != 0) {
    374 		int i;
    375 		for (i = 0; i < tok->argc; i++)
    376 		    tok->argv[i] = tok->argv[i] + offs;
    377 		tok->wptr   = tok->wptr + offs;
    378 		tok->wstart = tok->wstart + offs;
    379 		tok->wmax   = s + size;
    380 		tok->wspace = s;
    381 	    }
    382 	}
    383 
    384 	if (tok->argc >= tok->amax - 4) {
    385 	    tok->amax += AINCR;
    386 	    tok->argv = (char **) tok_realloc(tok->argv,
    387 					      tok->amax * sizeof(char*));
    388 	}
    389 
    390     }
    391 }
    392