tokenizer.c revision 1.3 1 /* $NetBSD: tokenizer.c,v 1.3 1997/07/06 18:25:37 christos Exp $ */
2
3 /*-
4 * Copyright (c) 1992, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Christos Zoulas of Cornell University.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 */
38
39 #include <sys/cdefs.h>
40 #if !defined(lint) && !defined(SCCSID)
41 #if 0
42 static char sccsid[] = "@(#)tokenizer.c 8.1 (Berkeley) 6/4/93";
43 #else
44 __RCSID("$NetBSD: tokenizer.c,v 1.3 1997/07/06 18:25:37 christos Exp $");
45 #endif
46 #endif /* not lint && not SCCSID */
47
48 /*
 * tokenizer.c: Bourne-shell-like tokenizer
50 */
51 #include "sys.h"
52 #include <string.h>
53 #include <stdlib.h>
54 #include "tokenizer.h"
55
/*
 * Quoting state of the scanner in tok_line().
 */
typedef enum {
    Q_none,		/* Not inside any quoting construct */
    Q_single,		/* Inside '...' */
    Q_double,		/* Inside "..." */
    Q_one,		/* Backslash seen outside quotes; next char is literal */
    Q_doubleone		/* Backslash seen inside "..." */
} quote_t;

/* Field separators used when tok_init() is given a NULL ifs. */
#define	IFS		"\t \n"

/* Bits stored in tokenizer.flags. */
#define	TOK_KEEP	1	/* Emit the current word even if it is empty */
#define	TOK_EAT		2	/* A backslash-newline pair was just swallowed */

/* Growth steps. */
#define	WINCR		20	/* Bytes added to the word buffer at a time */
#define	AINCR		10	/* Slots added to the argument vector at a time */

/* Allocator hooks; all allocation in this file goes through these. */
#define	tok_malloc(a)		malloc(a)
#define	tok_free(a)		free(a)
#define	tok_realloc(a, b)	realloc(a, b)
69
70
71 struct tokenizer {
72 char *ifs; /* In field separator */
73 int argc, amax; /* Current and maximum number of args */
74 char **argv; /* Argument list */
75 char *wptr, *wmax; /* Space and limit on the word buffer */
76 char *wstart; /* Beginning of next word */
77 char *wspace; /* Space of word buffer */
78 quote_t quote; /* Quoting state */
79 int flags; /* flags; */
80 };
81
82
83 private void tok_finish __P((Tokenizer *));
84
85
86 /* tok_finish():
87 * Finish a word in the tokenizer.
88 */
89 private void
90 tok_finish(tok)
91 Tokenizer *tok;
92 {
93 *tok->wptr = '\0';
94 if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
95 tok->argv[tok->argc++] = tok->wstart;
96 tok->argv[tok->argc] = NULL;
97 tok->wstart = ++tok->wptr;
98 }
99 tok->flags &= ~TOK_KEEP;
100 }
101
102
103 /* tok_init():
104 * Initialize the tokenizer
105 */
106 public Tokenizer *
107 tok_init(ifs)
108 const char *ifs;
109 {
110 Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer));
111
112 tok->ifs = strdup(ifs ? ifs : IFS);
113 tok->argc = 0;
114 tok->amax = AINCR;
115 tok->argv = (char **) tok_malloc(sizeof(char *) * tok->amax);
116 tok->argv[0] = NULL;
117 tok->wspace = (char *) tok_malloc(WINCR);
118 tok->wmax = tok->wspace + WINCR;
119 tok->wstart = tok->wspace;
120 tok->wptr = tok->wspace;
121 tok->flags = 0;
122 tok->quote = Q_none;
123
124 return tok;
125 }
126
127
128 /* tok_reset():
129 * Reset the tokenizer
130 */
131 public void
132 tok_reset(tok)
133 Tokenizer *tok;
134 {
135 tok->argc = 0;
136 tok->wstart = tok->wspace;
137 tok->wptr = tok->wspace;
138 tok->flags = 0;
139 tok->quote = Q_none;
140 }
141
142
143 /* tok_end():
144 * Clean up
145 */
146 public void
147 tok_end(tok)
148 Tokenizer *tok;
149 {
150 tok_free((ptr_t) tok->ifs);
151 tok_free((ptr_t) tok->wspace);
152 tok_free((ptr_t) tok->argv);
153 tok_free((ptr_t) tok);
154 }
155
156
157
158 /* tok_line():
159 * Bourne shell like tokenizing
160 * Return:
161 * -1: Internal error
162 * 3: Quoted return
163 * 2: Unmatched double quote
164 * 1: Unmatched single quote
165 * 0: Ok
166 */
167 public int
168 tok_line(tok, line, argc, argv)
169 Tokenizer *tok;
170 const char* line;
171 int *argc;
172 char ***argv;
173 {
174 const char *ptr;
175
176 while (1) {
177 switch (*(ptr = line++)) {
178 case '\'':
179 tok->flags |= TOK_KEEP;
180 tok->flags &= ~TOK_EAT;
181 switch (tok->quote) {
182 case Q_none:
183 tok->quote = Q_single; /* Enter single quote mode */
184 break;
185
186 case Q_single: /* Exit single quote mode */
187 tok->quote = Q_none;
188 break;
189
190 case Q_one: /* Quote this ' */
191 tok->quote = Q_none;
192 *tok->wptr++ = *ptr;
193 break;
194
195 case Q_double: /* Stay in double quote mode */
196 *tok->wptr++ = *ptr;
197 break;
198
199 case Q_doubleone: /* Quote this ' */
200 tok->quote = Q_double;
201 *tok->wptr++ = *ptr;
202 break;
203
204 default:
205 return(-1);
206 }
207 break;
208
209 case '"':
210 tok->flags &= ~TOK_EAT;
211 tok->flags |= TOK_KEEP;
212 switch (tok->quote) {
213 case Q_none: /* Enter double quote mode */
214 tok->quote = Q_double;
215 break;
216
217 case Q_double:
218 tok->quote = Q_none; /* Exit double quote mode */
219 break;
220
221 case Q_one: /* Quote this " */
222 tok->quote = Q_none;
223 *tok->wptr++ = *ptr;
224 break;
225
226 case Q_single: /* Stay in single quote mode */
227 *tok->wptr++ = *ptr;
228 break;
229
230 case Q_doubleone: /* Quote this " */
231 tok->quote = Q_double;
232 *tok->wptr++ = *ptr;
233 break;
234
235 default:
236 return(-1);
237 }
238 break;
239
240 case '\\':
241 tok->flags |= TOK_KEEP;
242 tok->flags &= ~TOK_EAT;
243 switch (tok->quote) {
244 case Q_none: /* Quote next character */
245 tok->quote = Q_one;
246 break;
247
248 case Q_double:
249 tok->quote = Q_doubleone;/* Quote next character */
250 break;
251
252 case Q_one:
253 *tok->wptr++ = *ptr;
254 tok->quote = Q_none; /* Quote this, restore state */
255 break;
256
257 case Q_single: /* Stay in single quote mode */
258 *tok->wptr++ = *ptr;
259 break;
260
261 case Q_doubleone: /* Quote this \ */
262 tok->quote = Q_double;
263 *tok->wptr++ = *ptr;
264 break;
265
266 default:
267 return(-1);
268 }
269 break;
270
271 case '\n':
272 tok->flags &= ~TOK_EAT;
273 switch (tok->quote) {
274 case Q_none:
275 tok_finish(tok);
276 *argv = tok->argv;
277 *argc = tok->argc;
278 return(0);
279
280 case Q_single:
281 case Q_double:
282 *tok->wptr++ = *ptr; /* Add the return */
283 break;
284
285 case Q_doubleone:
286 tok->flags |= TOK_EAT;
287 tok->quote = Q_double; /* Back to double, eat the '\n' */
288 break;
289
290 case Q_one:
291 tok->flags |= TOK_EAT;
292 tok->quote = Q_none; /* No quote, more eat the '\n' */
293 break;
294
295 default:
296 return(0);
297 }
298 break;
299
300 case '\0':
301 switch (tok->quote) {
302 case Q_none:
303 /* Finish word and return */
304 if (tok->flags & TOK_EAT) {
305 tok->flags &= ~TOK_EAT;
306 return 3;
307 }
308 tok_finish(tok);
309 *argv = tok->argv;
310 *argc = tok->argc;
311 return(0);
312
313 case Q_single:
314 return(1);
315
316 case Q_double:
317 return(2);
318
319 case Q_doubleone:
320 tok->quote = Q_double;
321 *tok->wptr++ = *ptr;
322 break;
323
324 case Q_one:
325 tok->quote = Q_none;
326 *tok->wptr++ = *ptr;
327 break;
328
329 default:
330 return(-1);
331 }
332 break;
333
334 default:
335 tok->flags &= ~TOK_EAT;
336 switch (tok->quote) {
337 case Q_none:
338 if (strchr(tok->ifs, *ptr) != NULL)
339 tok_finish(tok);
340 else
341 *tok->wptr++ = *ptr;
342 break;
343
344 case Q_single:
345 case Q_double:
346 *tok->wptr++ = *ptr;
347 break;
348
349
350 case Q_doubleone:
351 *tok->wptr++ = '\\';
352 tok->quote = Q_double;
353 *tok->wptr++ = *ptr;
354 break;
355
356 case Q_one:
357 tok->quote = Q_none;
358 *tok->wptr++ = *ptr;
359 break;
360
361 default:
362 return(-1);
363
364 }
365 break;
366 }
367
368 if (tok->wptr >= tok->wmax - 4) {
369 size_t size = tok->wmax - tok->wspace + WINCR;
370 char *s = (char *) tok_realloc(tok->wspace, size);
371 /*SUPPRESS 22*/
372 int offs = s - tok->wspace;
373
374 if (offs != 0) {
375 int i;
376 for (i = 0; i < tok->argc; i++)
377 tok->argv[i] = tok->argv[i] + offs;
378 tok->wptr = tok->wptr + offs;
379 tok->wstart = tok->wstart + offs;
380 tok->wmax = s + size;
381 tok->wspace = s;
382 }
383 }
384
385 if (tok->argc >= tok->amax - 4) {
386 tok->amax += AINCR;
387 tok->argv = (char **) tok_realloc(tok->argv,
388 tok->amax * sizeof(char*));
389 }
390
391 }
392 }
393