Home | History | Annotate | Line # | Download | only in libedit
tokenizer.c revision 1.1.1.1
      1 /*-
      2  * Copyright (c) 1992, 1993
      3  *	The Regents of the University of California.  All rights reserved.
      4  *
      5  * This code is derived from software contributed to Berkeley by
      6  * Christos Zoulas of Cornell University.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  * 3. All advertising materials mentioning features or use of this software
     17  *    must display the following acknowledgement:
     18  *	This product includes software developed by the University of
     19  *	California, Berkeley and its contributors.
     20  * 4. Neither the name of the University nor the names of its contributors
     21  *    may be used to endorse or promote products derived from this software
     22  *    without specific prior written permission.
     23  *
     24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     34  * SUCH DAMAGE.
     35  */
     36 
#if !defined(lint) && !defined(SCCSID)
/* SCCS version identification string for this file. */
static char sccsid[] = "@(#)tokenizer.c	8.1 (Berkeley) 6/4/93";
#endif /* !lint && !SCCSID */
     40 
/*
 * tokenizer.c: Bourne shell like tokenizer
 */
     44 #include "sys.h"
     45 #include <string.h>
     46 #include <stdlib.h>
     47 #include "tokenizer.h"
     48 
/*
 * Quoting state carried by the tokenizer from one character to the next.
 */
typedef enum {
    Q_none,		/* Not inside any quote				*/
    Q_single,		/* Inside '...'					*/
    Q_double,		/* Inside "..."					*/
    Q_one,		/* Backslash seen outside quotes: next char is literal */
    Q_doubleone		/* Backslash seen inside "...": next char is escaped */
} quote_t;
     50 
/* Default field separators, used when tok_init() is given a NULL ifs. */
#define IFS "\t \n"

/* Bits stored in the tokenizer's flags field. */
#define TOK_KEEP	1	/* Emit the current word even if it is empty */
#define TOK_EAT		2	/* A backslash-newline pair was just consumed */

/* Growth increments: bytes of word buffer, and slots of argv. */
#define WINCR 20
#define AINCR 10

/* Allocator hooks; all three map straight onto the C library. */
#define tok_malloc(a)		malloc(a)
#define tok_free(a)		free(a)
#define tok_realloc(a, b)	realloc(a, b)
     62 
     63 
     64 struct tokenizer {
     65     char   *ifs;		/* In field separator			*/
     66     int     argc, amax;		/* Current and maximum number of args	*/
     67     char  **argv;		/* Argument list			*/
     68     char   *wptr, *wmax;	/* Space and limit on the word buffer	*/
     69     char   *wstart;		/* Beginning of next word		*/
     70     char   *wspace;		/* Space of word buffer			*/
     71     quote_t quote;		/* Quoting state			*/
     72     int	    flags;		/* flags;				*/
     73 };
     74 
     75 
     76 private void tok_finish	__P((Tokenizer *));
     77 
     78 
     79 /* tok_finish():
     80  *	Finish a word in the tokenizer.
     81  */
     82 private void
     83 tok_finish(tok)
     84     Tokenizer *tok;
     85 {
     86     *tok->wptr = '\0';
     87     if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
     88 	tok->argv[tok->argc++] = tok->wstart;
     89 	tok->argv[tok->argc] = NULL;
     90 	tok->wstart = ++tok->wptr;
     91     }
     92     tok->flags &= ~TOK_KEEP;
     93 }
     94 
     95 
     96 /* tok_init():
     97  *	Initialize the tokenizer
     98  */
     99 public Tokenizer *
    100 tok_init(ifs)
    101     const char *ifs;
    102 {
    103     Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer));
    104 
    105     tok->ifs     = strdup(ifs ? ifs : IFS);
    106     tok->argc    = 0;
    107     tok->amax    = AINCR;
    108     tok->argv    = (char **) tok_malloc(sizeof(char *) * tok->amax);
    109     tok->argv[0] = NULL;
    110     tok->wspace  = (char *) tok_malloc(WINCR);
    111     tok->wmax    = tok->wspace + WINCR;
    112     tok->wstart  = tok->wspace;
    113     tok->wptr    = tok->wspace;
    114     tok->flags   = 0;
    115     tok->quote   = Q_none;
    116 
    117     return tok;
    118 }
    119 
    120 
    121 /* tok_reset():
    122  *	Reset the tokenizer
    123  */
    124 public void
    125 tok_reset(tok)
    126     Tokenizer *tok;
    127 {
    128     tok->argc  = 0;
    129     tok->wstart = tok->wspace;
    130     tok->wptr = tok->wspace;
    131     tok->flags = 0;
    132     tok->quote = Q_none;
    133 }
    134 
    135 
    136 /* tok_end():
    137  *	Clean up
    138  */
    139 public void
    140 tok_end(tok)
    141     Tokenizer *tok;
    142 {
    143     tok_free((ptr_t) tok->ifs);
    144     tok_free((ptr_t) tok->wspace);
    145     tok_free((ptr_t) tok->argv);
    146     tok_free((ptr_t) tok);
    147 }
    148 
    149 
    150 
    151 /* tok_line():
    152  *	Bourne shell like tokenizing
    153  *	Return:
    154  *		-1: Internal error
    155  *		 3: Quoted return
    156  *		 2: Unmatched double quote
    157  *		 1: Unmatched single quote
    158  *		 0: Ok
    159  */
    160 public int
    161 tok_line(tok, line, argc, argv)
    162     Tokenizer *tok;
    163     const char* line;
    164     int *argc;
    165     char ***argv;
    166 {
    167     const char *ptr;
    168 
    169     while (1) {
    170 	switch (*(ptr = line++)) {
    171 	case '\'':
    172 	    tok->flags |= TOK_KEEP;
    173 	    tok->flags &= ~TOK_EAT;
    174 	    switch (tok->quote) {
    175 	    case Q_none:
    176 		tok->quote = Q_single;	/* Enter single quote mode */
    177 		break;
    178 
    179 	    case Q_single:		/* Exit single quote mode */
    180 		tok->quote = Q_none;
    181 		break;
    182 
    183 	    case Q_one:			/* Quote this ' */
    184 		tok->quote = Q_none;
    185 		*tok->wptr++ = *ptr;
    186 		break;
    187 
    188 	    case Q_double:		/* Stay in double quote mode */
    189 		*tok->wptr++ = *ptr;
    190 		break;
    191 
    192 	    case Q_doubleone:		/* Quote this ' */
    193 		tok->quote = Q_double;
    194 		*tok->wptr++ = *ptr;
    195 		break;
    196 
    197 	    default:
    198 		return(-1);
    199 	    }
    200 	    break;
    201 
    202 	case '"':
    203 	    tok->flags &= ~TOK_EAT;
    204 	    tok->flags |= TOK_KEEP;
    205 	    switch (tok->quote) {
    206 	    case Q_none:		/* Enter double quote mode */
    207 		tok->quote = Q_double;
    208 		break;
    209 
    210 	    case Q_double:
    211 		tok->quote = Q_none;	/* Exit double quote mode */
    212 		break;
    213 
    214 	    case Q_one:			/* Quote this " */
    215 		tok->quote = Q_none;
    216 		*tok->wptr++ = *ptr;
    217 		break;
    218 
    219 	    case Q_single:		/* Stay in single quote mode */
    220 		*tok->wptr++ = *ptr;
    221 		break;
    222 
    223 	    case Q_doubleone:		/* Quote this " */
    224 		tok->quote = Q_double;
    225 		*tok->wptr++ = *ptr;
    226 		break;
    227 
    228 	    default:
    229 		return(-1);
    230 	    }
    231 	    break;
    232 
    233 	case '\\':
    234 	    tok->flags |= TOK_KEEP;
    235 	    tok->flags &= ~TOK_EAT;
    236 	    switch (tok->quote) {
    237 	    case Q_none:		/* Quote next character */
    238 		tok->quote = Q_one;
    239 		break;
    240 
    241 	    case Q_double:
    242 		tok->quote = Q_doubleone;/* Quote next character */
    243 		break;
    244 
    245 	    case Q_one:
    246 		*tok->wptr++ = *ptr;
    247 		tok->quote = Q_none;	/* Quote this, restore state */
    248 		break;
    249 
    250 	    case Q_single:		/* Stay in single quote mode */
    251 		*tok->wptr++ = *ptr;
    252 		break;
    253 
    254 	    case Q_doubleone:		/* Quote this \ */
    255 		tok->quote = Q_double;
    256 		*tok->wptr++ = *ptr;
    257 		break;
    258 
    259 	    default:
    260 		return(-1);
    261 	    }
    262 	    break;
    263 
    264 	case '\n':
    265 	    tok->flags &= ~TOK_EAT;
    266 	    switch (tok->quote) {
    267 	    case Q_none:
    268 		tok_finish(tok);
    269 		*argv = tok->argv;
    270 		*argc = tok->argc;
    271 		return(0);
    272 
    273 	    case Q_single:
    274 	    case Q_double:
    275 		*tok->wptr++ = *ptr;	/* Add the return		*/
    276 		break;
    277 
    278 	    case Q_doubleone:
    279 		tok->flags |= TOK_EAT;
    280 		tok->quote = Q_double;	/* Back to double, eat the '\n' */
    281 		break;
    282 
    283 	    case Q_one:
    284 		tok->flags |= TOK_EAT;
    285 		tok->quote = Q_none;	/* No quote, more eat the '\n' */
    286 		break;
    287 
    288 	    default:
    289 		return(0);
    290 	    }
    291 	    break;
    292 
    293 	case '\0':
    294 	    switch (tok->quote) {
    295 	    case Q_none:
    296 		/* Finish word and return */
    297 		if (tok->flags & TOK_EAT) {
    298 		    tok->flags &= ~TOK_EAT;
    299 		    return 3;
    300 		}
    301 		tok_finish(tok);
    302 		*argv = tok->argv;
    303 		*argc = tok->argc;
    304 		return(0);
    305 
    306 	    case Q_single:
    307 		return(1);
    308 
    309 	    case Q_double:
    310 		return(2);
    311 
    312 	    case Q_doubleone:
    313 		tok->quote = Q_double;
    314 		*tok->wptr++ = *ptr;
    315 		break;
    316 
    317 	    case Q_one:
    318 		tok->quote = Q_none;
    319 		*tok->wptr++ = *ptr;
    320 		break;
    321 
    322 	    default:
    323 		return(-1);
    324 	    }
    325 	    break;
    326 
    327 	default:
    328 	    tok->flags &= ~TOK_EAT;
    329 	    switch (tok->quote) {
    330 	    case Q_none:
    331 		if (strchr(tok->ifs, *ptr) != NULL)
    332 		    tok_finish(tok);
    333 		else
    334 		    *tok->wptr++ = *ptr;
    335 		break;
    336 
    337 	    case Q_single:
    338 	    case Q_double:
    339 		*tok->wptr++ = *ptr;
    340 		break;
    341 
    342 
    343 	    case Q_doubleone:
    344 		*tok->wptr++ = '\\';
    345 		tok->quote = Q_double;
    346 		*tok->wptr++ = *ptr;
    347 		break;
    348 
    349 	    case Q_one:
    350 		tok->quote = Q_none;
    351 		*tok->wptr++ = *ptr;
    352 		break;
    353 
    354 	    default:
    355 		return(-1);
    356 
    357 	    }
    358 	    break;
    359 	}
    360 
    361 	if (tok->wptr >= tok->wmax - 4) {
    362 	    size_t size = tok->wmax - tok->wspace + WINCR;
    363 	    char *s = (char *) tok_realloc(tok->wspace, size);
    364 	    /*SUPPRESS 22*/
    365 	    int offs = s - tok->wspace;
    366 
    367 	    if (offs != 0) {
    368 		int i;
    369 		for (i = 0; i < tok->argc; i++)
    370 		    tok->argv[i] = tok->argv[i] + offs;
    371 		tok->wptr   = tok->wptr + offs;
    372 		tok->wstart = tok->wstart + offs;
    373 		tok->wmax   = s + size;
    374 		tok->wspace = s;
    375 	    }
    376 	}
    377 
    378 	if (tok->argc >= tok->amax - 4) {
    379 	    tok->amax += AINCR;
    380 	    tok->argv = (char **) tok_realloc(tok->argv,
    381 					      tok->amax * sizeof(char*));
    382 	}
    383 
    384     }
    385 }
    386