tokenizer.c revision 1.5 1 1.5 simonb /* $NetBSD: tokenizer.c,v 1.5 1999/07/02 15:21:27 simonb Exp $ */
2 1.2 lukem
3 1.1 cgd /*-
4 1.1 cgd * Copyright (c) 1992, 1993
5 1.1 cgd * The Regents of the University of California. All rights reserved.
6 1.1 cgd *
7 1.1 cgd * This code is derived from software contributed to Berkeley by
8 1.1 cgd * Christos Zoulas of Cornell University.
9 1.1 cgd *
10 1.1 cgd * Redistribution and use in source and binary forms, with or without
11 1.1 cgd * modification, are permitted provided that the following conditions
12 1.1 cgd * are met:
13 1.1 cgd * 1. Redistributions of source code must retain the above copyright
14 1.1 cgd * notice, this list of conditions and the following disclaimer.
15 1.1 cgd * 2. Redistributions in binary form must reproduce the above copyright
16 1.1 cgd * notice, this list of conditions and the following disclaimer in the
17 1.1 cgd * documentation and/or other materials provided with the distribution.
18 1.1 cgd * 3. All advertising materials mentioning features or use of this software
19 1.1 cgd * must display the following acknowledgement:
20 1.1 cgd * This product includes software developed by the University of
21 1.1 cgd * California, Berkeley and its contributors.
22 1.1 cgd * 4. Neither the name of the University nor the names of its contributors
23 1.1 cgd * may be used to endorse or promote products derived from this software
24 1.1 cgd * without specific prior written permission.
25 1.1 cgd *
26 1.1 cgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 1.1 cgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 1.1 cgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 1.1 cgd * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 1.1 cgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 1.1 cgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 1.1 cgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 1.1 cgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 1.1 cgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 1.1 cgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 1.1 cgd * SUCH DAMAGE.
37 1.1 cgd */
38 1.1 cgd
39 1.3 christos #include <sys/cdefs.h>
40 1.1 cgd #if !defined(lint) && !defined(SCCSID)
41 1.2 lukem #if 0
42 1.1 cgd static char sccsid[] = "@(#)tokenizer.c 8.1 (Berkeley) 6/4/93";
43 1.2 lukem #else
44 1.5 simonb __RCSID("$NetBSD: tokenizer.c,v 1.5 1999/07/02 15:21:27 simonb Exp $");
45 1.2 lukem #endif
46 1.1 cgd #endif /* not lint && not SCCSID */
47 1.1 cgd
48 1.1 cgd /*
49 1.1 cgd * tokenizer.c: Bourne shell like tokenizer
50 1.1 cgd */
51 1.1 cgd #include "sys.h"
52 1.1 cgd #include <string.h>
53 1.1 cgd #include <stdlib.h>
54 1.1 cgd #include "tokenizer.h"
55 1.1 cgd
/* Quoting state tracked by tok_line() between characters. */
typedef enum {
    Q_none,             /* Not inside any quoting construct */
    Q_single,           /* Inside '...' */
    Q_double,           /* Inside "..." */
    Q_one,              /* Just saw a backslash outside of quotes */
    Q_doubleone         /* Just saw a backslash inside "..." */
} quote_t;

/* Field separators used when tok_init() is handed a NULL ifs */
#define IFS             "\t \n"

/* Bits kept in the tokenizer's flags field */
#define TOK_KEEP        1       /* Quoting was seen: emit the word even if empty */
#define TOK_EAT         2       /* A backslash-newline pair was just swallowed */

#define WINCR           20      /* Word buffer: initial size and growth step (bytes) */
#define AINCR           10      /* Argument vector: initial size and growth step (slots) */

/* Allocation hooks; all memory handled by the tokenizer goes through these */
#define tok_malloc(a)           malloc(a)
#define tok_free(a)             free(a)
#define tok_realloc(a, b)       realloc(a, b)
69 1.1 cgd
70 1.1 cgd
71 1.1 cgd struct tokenizer {
72 1.1 cgd char *ifs; /* In field separator */
73 1.1 cgd int argc, amax; /* Current and maximum number of args */
74 1.1 cgd char **argv; /* Argument list */
75 1.1 cgd char *wptr, *wmax; /* Space and limit on the word buffer */
76 1.1 cgd char *wstart; /* Beginning of next word */
77 1.1 cgd char *wspace; /* Space of word buffer */
78 1.1 cgd quote_t quote; /* Quoting state */
79 1.1 cgd int flags; /* flags; */
80 1.1 cgd };
81 1.1 cgd
82 1.1 cgd
83 1.1 cgd private void tok_finish __P((Tokenizer *));
84 1.1 cgd
85 1.1 cgd
86 1.1 cgd /* tok_finish():
87 1.1 cgd * Finish a word in the tokenizer.
88 1.1 cgd */
89 1.1 cgd private void
90 1.1 cgd tok_finish(tok)
91 1.1 cgd Tokenizer *tok;
92 1.1 cgd {
93 1.1 cgd *tok->wptr = '\0';
94 1.1 cgd if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
95 1.1 cgd tok->argv[tok->argc++] = tok->wstart;
96 1.1 cgd tok->argv[tok->argc] = NULL;
97 1.1 cgd tok->wstart = ++tok->wptr;
98 1.1 cgd }
99 1.1 cgd tok->flags &= ~TOK_KEEP;
100 1.1 cgd }
101 1.1 cgd
102 1.1 cgd
103 1.1 cgd /* tok_init():
104 1.1 cgd * Initialize the tokenizer
105 1.1 cgd */
106 1.1 cgd public Tokenizer *
107 1.1 cgd tok_init(ifs)
108 1.1 cgd const char *ifs;
109 1.1 cgd {
110 1.1 cgd Tokenizer* tok = (Tokenizer*) tok_malloc(sizeof(Tokenizer));
111 1.1 cgd
112 1.1 cgd tok->ifs = strdup(ifs ? ifs : IFS);
113 1.1 cgd tok->argc = 0;
114 1.1 cgd tok->amax = AINCR;
115 1.1 cgd tok->argv = (char **) tok_malloc(sizeof(char *) * tok->amax);
116 1.1 cgd tok->argv[0] = NULL;
117 1.1 cgd tok->wspace = (char *) tok_malloc(WINCR);
118 1.1 cgd tok->wmax = tok->wspace + WINCR;
119 1.1 cgd tok->wstart = tok->wspace;
120 1.1 cgd tok->wptr = tok->wspace;
121 1.1 cgd tok->flags = 0;
122 1.1 cgd tok->quote = Q_none;
123 1.1 cgd
124 1.1 cgd return tok;
125 1.1 cgd }
126 1.1 cgd
127 1.1 cgd
128 1.1 cgd /* tok_reset():
129 1.1 cgd * Reset the tokenizer
130 1.1 cgd */
131 1.1 cgd public void
132 1.1 cgd tok_reset(tok)
133 1.1 cgd Tokenizer *tok;
134 1.1 cgd {
135 1.1 cgd tok->argc = 0;
136 1.1 cgd tok->wstart = tok->wspace;
137 1.1 cgd tok->wptr = tok->wspace;
138 1.1 cgd tok->flags = 0;
139 1.1 cgd tok->quote = Q_none;
140 1.1 cgd }
141 1.1 cgd
142 1.1 cgd
143 1.1 cgd /* tok_end():
144 1.1 cgd * Clean up
145 1.1 cgd */
146 1.1 cgd public void
147 1.1 cgd tok_end(tok)
148 1.1 cgd Tokenizer *tok;
149 1.1 cgd {
150 1.1 cgd tok_free((ptr_t) tok->ifs);
151 1.1 cgd tok_free((ptr_t) tok->wspace);
152 1.1 cgd tok_free((ptr_t) tok->argv);
153 1.1 cgd tok_free((ptr_t) tok);
154 1.1 cgd }
155 1.1 cgd
156 1.1 cgd
157 1.1 cgd
158 1.1 cgd /* tok_line():
159 1.1 cgd * Bourne shell like tokenizing
160 1.1 cgd * Return:
161 1.1 cgd * -1: Internal error
162 1.1 cgd * 3: Quoted return
163 1.1 cgd * 2: Unmatched double quote
164 1.1 cgd * 1: Unmatched single quote
165 1.5 simonb * 0: Ok
166 1.1 cgd */
167 1.1 cgd public int
168 1.1 cgd tok_line(tok, line, argc, argv)
169 1.1 cgd Tokenizer *tok;
170 1.1 cgd const char* line;
171 1.1 cgd int *argc;
172 1.1 cgd char ***argv;
173 1.1 cgd {
174 1.1 cgd const char *ptr;
175 1.1 cgd
176 1.4 christos for (;;) {
177 1.1 cgd switch (*(ptr = line++)) {
178 1.1 cgd case '\'':
179 1.1 cgd tok->flags |= TOK_KEEP;
180 1.1 cgd tok->flags &= ~TOK_EAT;
181 1.1 cgd switch (tok->quote) {
182 1.1 cgd case Q_none:
183 1.1 cgd tok->quote = Q_single; /* Enter single quote mode */
184 1.1 cgd break;
185 1.1 cgd
186 1.1 cgd case Q_single: /* Exit single quote mode */
187 1.1 cgd tok->quote = Q_none;
188 1.1 cgd break;
189 1.1 cgd
190 1.1 cgd case Q_one: /* Quote this ' */
191 1.1 cgd tok->quote = Q_none;
192 1.1 cgd *tok->wptr++ = *ptr;
193 1.1 cgd break;
194 1.1 cgd
195 1.1 cgd case Q_double: /* Stay in double quote mode */
196 1.1 cgd *tok->wptr++ = *ptr;
197 1.1 cgd break;
198 1.1 cgd
199 1.1 cgd case Q_doubleone: /* Quote this ' */
200 1.1 cgd tok->quote = Q_double;
201 1.1 cgd *tok->wptr++ = *ptr;
202 1.1 cgd break;
203 1.1 cgd
204 1.1 cgd default:
205 1.1 cgd return(-1);
206 1.1 cgd }
207 1.1 cgd break;
208 1.1 cgd
209 1.1 cgd case '"':
210 1.1 cgd tok->flags &= ~TOK_EAT;
211 1.1 cgd tok->flags |= TOK_KEEP;
212 1.1 cgd switch (tok->quote) {
213 1.1 cgd case Q_none: /* Enter double quote mode */
214 1.1 cgd tok->quote = Q_double;
215 1.1 cgd break;
216 1.1 cgd
217 1.1 cgd case Q_double:
218 1.1 cgd tok->quote = Q_none; /* Exit double quote mode */
219 1.1 cgd break;
220 1.1 cgd
221 1.1 cgd case Q_one: /* Quote this " */
222 1.1 cgd tok->quote = Q_none;
223 1.1 cgd *tok->wptr++ = *ptr;
224 1.1 cgd break;
225 1.1 cgd
226 1.1 cgd case Q_single: /* Stay in single quote mode */
227 1.1 cgd *tok->wptr++ = *ptr;
228 1.1 cgd break;
229 1.1 cgd
230 1.1 cgd case Q_doubleone: /* Quote this " */
231 1.1 cgd tok->quote = Q_double;
232 1.1 cgd *tok->wptr++ = *ptr;
233 1.1 cgd break;
234 1.1 cgd
235 1.5 simonb default:
236 1.1 cgd return(-1);
237 1.1 cgd }
238 1.1 cgd break;
239 1.1 cgd
240 1.1 cgd case '\\':
241 1.1 cgd tok->flags |= TOK_KEEP;
242 1.1 cgd tok->flags &= ~TOK_EAT;
243 1.1 cgd switch (tok->quote) {
244 1.1 cgd case Q_none: /* Quote next character */
245 1.1 cgd tok->quote = Q_one;
246 1.1 cgd break;
247 1.1 cgd
248 1.1 cgd case Q_double:
249 1.1 cgd tok->quote = Q_doubleone;/* Quote next character */
250 1.1 cgd break;
251 1.1 cgd
252 1.5 simonb case Q_one:
253 1.1 cgd *tok->wptr++ = *ptr;
254 1.1 cgd tok->quote = Q_none; /* Quote this, restore state */
255 1.1 cgd break;
256 1.1 cgd
257 1.1 cgd case Q_single: /* Stay in single quote mode */
258 1.1 cgd *tok->wptr++ = *ptr;
259 1.1 cgd break;
260 1.1 cgd
261 1.1 cgd case Q_doubleone: /* Quote this \ */
262 1.1 cgd tok->quote = Q_double;
263 1.1 cgd *tok->wptr++ = *ptr;
264 1.1 cgd break;
265 1.1 cgd
266 1.1 cgd default:
267 1.1 cgd return(-1);
268 1.1 cgd }
269 1.1 cgd break;
270 1.1 cgd
271 1.1 cgd case '\n':
272 1.1 cgd tok->flags &= ~TOK_EAT;
273 1.1 cgd switch (tok->quote) {
274 1.1 cgd case Q_none:
275 1.1 cgd tok_finish(tok);
276 1.1 cgd *argv = tok->argv;
277 1.1 cgd *argc = tok->argc;
278 1.1 cgd return(0);
279 1.1 cgd
280 1.1 cgd case Q_single:
281 1.1 cgd case Q_double:
282 1.1 cgd *tok->wptr++ = *ptr; /* Add the return */
283 1.1 cgd break;
284 1.5 simonb
285 1.1 cgd case Q_doubleone:
286 1.1 cgd tok->flags |= TOK_EAT;
287 1.1 cgd tok->quote = Q_double; /* Back to double, eat the '\n' */
288 1.1 cgd break;
289 1.1 cgd
290 1.1 cgd case Q_one:
291 1.1 cgd tok->flags |= TOK_EAT;
292 1.1 cgd tok->quote = Q_none; /* No quote, more eat the '\n' */
293 1.1 cgd break;
294 1.1 cgd
295 1.1 cgd default:
296 1.1 cgd return(0);
297 1.1 cgd }
298 1.1 cgd break;
299 1.1 cgd
300 1.1 cgd case '\0':
301 1.1 cgd switch (tok->quote) {
302 1.1 cgd case Q_none:
303 1.1 cgd /* Finish word and return */
304 1.1 cgd if (tok->flags & TOK_EAT) {
305 1.1 cgd tok->flags &= ~TOK_EAT;
306 1.1 cgd return 3;
307 1.1 cgd }
308 1.1 cgd tok_finish(tok);
309 1.1 cgd *argv = tok->argv;
310 1.1 cgd *argc = tok->argc;
311 1.1 cgd return(0);
312 1.1 cgd
313 1.1 cgd case Q_single:
314 1.1 cgd return(1);
315 1.1 cgd
316 1.1 cgd case Q_double:
317 1.1 cgd return(2);
318 1.1 cgd
319 1.1 cgd case Q_doubleone:
320 1.1 cgd tok->quote = Q_double;
321 1.1 cgd *tok->wptr++ = *ptr;
322 1.1 cgd break;
323 1.1 cgd
324 1.1 cgd case Q_one:
325 1.1 cgd tok->quote = Q_none;
326 1.1 cgd *tok->wptr++ = *ptr;
327 1.1 cgd break;
328 1.1 cgd
329 1.1 cgd default:
330 1.1 cgd return(-1);
331 1.1 cgd }
332 1.1 cgd break;
333 1.1 cgd
334 1.1 cgd default:
335 1.1 cgd tok->flags &= ~TOK_EAT;
336 1.1 cgd switch (tok->quote) {
337 1.1 cgd case Q_none:
338 1.1 cgd if (strchr(tok->ifs, *ptr) != NULL)
339 1.1 cgd tok_finish(tok);
340 1.1 cgd else
341 1.1 cgd *tok->wptr++ = *ptr;
342 1.1 cgd break;
343 1.1 cgd
344 1.1 cgd case Q_single:
345 1.1 cgd case Q_double:
346 1.1 cgd *tok->wptr++ = *ptr;
347 1.1 cgd break;
348 1.1 cgd
349 1.1 cgd
350 1.1 cgd case Q_doubleone:
351 1.1 cgd *tok->wptr++ = '\\';
352 1.1 cgd tok->quote = Q_double;
353 1.1 cgd *tok->wptr++ = *ptr;
354 1.1 cgd break;
355 1.1 cgd
356 1.1 cgd case Q_one:
357 1.1 cgd tok->quote = Q_none;
358 1.1 cgd *tok->wptr++ = *ptr;
359 1.1 cgd break;
360 1.1 cgd
361 1.1 cgd default:
362 1.1 cgd return(-1);
363 1.1 cgd
364 1.1 cgd }
365 1.1 cgd break;
366 1.1 cgd }
367 1.1 cgd
368 1.1 cgd if (tok->wptr >= tok->wmax - 4) {
369 1.1 cgd size_t size = tok->wmax - tok->wspace + WINCR;
370 1.1 cgd char *s = (char *) tok_realloc(tok->wspace, size);
371 1.1 cgd /*SUPPRESS 22*/
372 1.1 cgd int offs = s - tok->wspace;
373 1.1 cgd
374 1.1 cgd if (offs != 0) {
375 1.1 cgd int i;
376 1.1 cgd for (i = 0; i < tok->argc; i++)
377 1.1 cgd tok->argv[i] = tok->argv[i] + offs;
378 1.1 cgd tok->wptr = tok->wptr + offs;
379 1.1 cgd tok->wstart = tok->wstart + offs;
380 1.1 cgd tok->wmax = s + size;
381 1.1 cgd tok->wspace = s;
382 1.1 cgd }
383 1.1 cgd }
384 1.1 cgd
385 1.1 cgd if (tok->argc >= tok->amax - 4) {
386 1.1 cgd tok->amax += AINCR;
387 1.5 simonb tok->argv = (char **) tok_realloc(tok->argv,
388 1.1 cgd tok->amax * sizeof(char*));
389 1.1 cgd }
390 1.1 cgd
391 1.1 cgd }
392 1.1 cgd }
393