tokenizer.c revision 1.24 1 /* $NetBSD: tokenizer.c,v 1.24 2016/02/17 19:47:49 christos Exp $ */
2
3 /*-
4 * Copyright (c) 1992, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Christos Zoulas of Cornell University.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 #include "config.h"
36 #if !defined(lint) && !defined(SCCSID)
37 #if 0
38 static char sccsid[] = "@(#)tokenizer.c 8.1 (Berkeley) 6/4/93";
39 #else
40 __RCSID("$NetBSD: tokenizer.c,v 1.24 2016/02/17 19:47:49 christos Exp $");
41 #endif
42 #endif /* not lint && not SCCSID */
43
44 /* We build this file twice, once as NARROW, once as WIDE. */
/*
 * tokenizer.c: Bourne shell like tokenizer
 */
48 #include <stdlib.h>
49 #include <string.h>
50
51 #include "histedit.h"
52 #include "chartype.h"
53
/*
 * Quoting state tracked by the tokenizer while it scans a line.
 */
typedef enum {
	Q_none,		/* Not inside any quoting */
	Q_single,	/* Inside a '...' section */
	Q_double,	/* Inside a "..." section */
	Q_one,		/* Unquoted backslash seen; next character is quoted */
	Q_doubleone	/* Backslash seen inside a "..." section */
} quote_t;
57
/*
 * Bits stored in the tokenizer's flags field.
 */
enum {
	TOK_KEEP = 1,	/* Emit the current word even if it is empty */
	TOK_EAT = 2	/* A backslash-newline pair was just swallowed */
};

/*
 * Allocation increments; each is also the initial size of its buffer.
 */
enum {
	WINCR = 20,	/* Word buffer step, counted in Char units */
	AINCR = 10	/* Argument vector step, counted in entries */
};

/* Separators used when the caller does not supply any: tab, space, newline */
#define	IFS			STR("\t \n")

/* Allocation hooks; every tokenizer allocation goes through these */
#define	tok_malloc(a)		malloc(a)
#define	tok_free(a)		free(a)
#define	tok_realloc(a, b)	realloc(a, b)
#define	tok_strdup(a)		Strdup(a)
70
71
72 struct TYPE(tokenizer) {
73 Char *ifs; /* In field separator */
74 size_t argc, amax; /* Current and maximum number of args */
75 Char **argv; /* Argument list */
76 Char *wptr, *wmax; /* Space and limit on the word buffer */
77 Char *wstart; /* Beginning of next word */
78 Char *wspace; /* Space of word buffer */
79 quote_t quote; /* Quoting state */
80 int flags; /* flags; */
81 };
82
83
84 private void FUN(tok,finish)(TYPE(Tokenizer) *);
85
86
87 /* FUN(tok,finish)():
88 * Finish a word in the tokenizer.
89 */
90 private void
91 FUN(tok,finish)(TYPE(Tokenizer) *tok)
92 {
93
94 *tok->wptr = '\0';
95 if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
96 tok->argv[tok->argc++] = tok->wstart;
97 tok->argv[tok->argc] = NULL;
98 tok->wstart = ++tok->wptr;
99 }
100 tok->flags &= ~TOK_KEEP;
101 }
102
103
104 /* FUN(tok,init)():
105 * Initialize the tokenizer
106 */
107 public TYPE(Tokenizer) *
108 FUN(tok,init)(const Char *ifs)
109 {
110 TYPE(Tokenizer) *tok = tok_malloc(sizeof(*tok));
111
112 if (tok == NULL)
113 return NULL;
114 tok->ifs = tok_strdup(ifs ? ifs : IFS);
115 if (tok->ifs == NULL) {
116 tok_free(tok);
117 return NULL;
118 }
119 tok->argc = 0;
120 tok->amax = AINCR;
121 tok->argv = tok_malloc(sizeof(*tok->argv) * tok->amax);
122 if (tok->argv == NULL) {
123 tok_free(tok->ifs);
124 tok_free(tok);
125 return NULL;
126 }
127 tok->argv[0] = NULL;
128 tok->wspace = tok_malloc(WINCR * sizeof(*tok->wspace));
129 if (tok->wspace == NULL) {
130 tok_free(tok->argv);
131 tok_free(tok->ifs);
132 tok_free(tok);
133 return NULL;
134 }
135 tok->wmax = tok->wspace + WINCR;
136 tok->wstart = tok->wspace;
137 tok->wptr = tok->wspace;
138 tok->flags = 0;
139 tok->quote = Q_none;
140
141 return tok;
142 }
143
144
145 /* FUN(tok,reset)():
146 * Reset the tokenizer
147 */
148 public void
149 FUN(tok,reset)(TYPE(Tokenizer) *tok)
150 {
151
152 tok->argc = 0;
153 tok->wstart = tok->wspace;
154 tok->wptr = tok->wspace;
155 tok->flags = 0;
156 tok->quote = Q_none;
157 }
158
159
160 /* FUN(tok,end)():
161 * Clean up
162 */
163 public void
164 FUN(tok,end)(TYPE(Tokenizer) *tok)
165 {
166
167 tok_free(tok->ifs);
168 tok_free(tok->wspace);
169 tok_free(tok->argv);
170 tok_free(tok);
171 }
172
173
174
175 /* FUN(tok,line)():
176 * Bourne shell (sh(1)) like tokenizing
177 * Arguments:
178 * tok current tokenizer state (setup with FUN(tok,init)())
179 * line line to parse
180 * Returns:
181 * -1 Internal error
182 * 3 Quoted return
183 * 2 Unmatched double quote
184 * 1 Unmatched single quote
185 * 0 Ok
186 * Modifies (if return value is 0):
187 * argc number of arguments
188 * argv argument array
189 * cursorc if !NULL, argv element containing cursor
190 * cursorv if !NULL, offset in argv[cursorc] of cursor
191 */
192 public int
193 FUN(tok,line)(TYPE(Tokenizer) *tok, const TYPE(LineInfo) *line,
194 int *argc, const Char ***argv, int *cursorc, int *cursoro)
195 {
196 const Char *ptr;
197 int cc, co;
198
199 cc = co = -1;
200 ptr = line->buffer;
201 for (ptr = line->buffer; ;ptr++) {
202 if (ptr >= line->lastchar)
203 ptr = STR("");
204 if (ptr == line->cursor) {
205 cc = (int)tok->argc;
206 co = (int)(tok->wptr - tok->wstart);
207 }
208 switch (*ptr) {
209 case '\'':
210 tok->flags |= TOK_KEEP;
211 tok->flags &= ~TOK_EAT;
212 switch (tok->quote) {
213 case Q_none:
214 tok->quote = Q_single; /* Enter single quote
215 * mode */
216 break;
217
218 case Q_single: /* Exit single quote mode */
219 tok->quote = Q_none;
220 break;
221
222 case Q_one: /* Quote this ' */
223 tok->quote = Q_none;
224 *tok->wptr++ = *ptr;
225 break;
226
227 case Q_double: /* Stay in double quote mode */
228 *tok->wptr++ = *ptr;
229 break;
230
231 case Q_doubleone: /* Quote this ' */
232 tok->quote = Q_double;
233 *tok->wptr++ = *ptr;
234 break;
235
236 default:
237 return -1;
238 }
239 break;
240
241 case '"':
242 tok->flags &= ~TOK_EAT;
243 tok->flags |= TOK_KEEP;
244 switch (tok->quote) {
245 case Q_none: /* Enter double quote mode */
246 tok->quote = Q_double;
247 break;
248
249 case Q_double: /* Exit double quote mode */
250 tok->quote = Q_none;
251 break;
252
253 case Q_one: /* Quote this " */
254 tok->quote = Q_none;
255 *tok->wptr++ = *ptr;
256 break;
257
258 case Q_single: /* Stay in single quote mode */
259 *tok->wptr++ = *ptr;
260 break;
261
262 case Q_doubleone: /* Quote this " */
263 tok->quote = Q_double;
264 *tok->wptr++ = *ptr;
265 break;
266
267 default:
268 return -1;
269 }
270 break;
271
272 case '\\':
273 tok->flags |= TOK_KEEP;
274 tok->flags &= ~TOK_EAT;
275 switch (tok->quote) {
276 case Q_none: /* Quote next character */
277 tok->quote = Q_one;
278 break;
279
280 case Q_double: /* Quote next character */
281 tok->quote = Q_doubleone;
282 break;
283
284 case Q_one: /* Quote this, restore state */
285 *tok->wptr++ = *ptr;
286 tok->quote = Q_none;
287 break;
288
289 case Q_single: /* Stay in single quote mode */
290 *tok->wptr++ = *ptr;
291 break;
292
293 case Q_doubleone: /* Quote this \ */
294 tok->quote = Q_double;
295 *tok->wptr++ = *ptr;
296 break;
297
298 default:
299 return -1;
300 }
301 break;
302
303 case '\n':
304 tok->flags &= ~TOK_EAT;
305 switch (tok->quote) {
306 case Q_none:
307 goto tok_line_outok;
308
309 case Q_single:
310 case Q_double:
311 *tok->wptr++ = *ptr; /* Add the return */
312 break;
313
314 case Q_doubleone: /* Back to double, eat the '\n' */
315 tok->flags |= TOK_EAT;
316 tok->quote = Q_double;
317 break;
318
319 case Q_one: /* No quote, more eat the '\n' */
320 tok->flags |= TOK_EAT;
321 tok->quote = Q_none;
322 break;
323
324 default:
325 return 0;
326 }
327 break;
328
329 case '\0':
330 switch (tok->quote) {
331 case Q_none:
332 /* Finish word and return */
333 if (tok->flags & TOK_EAT) {
334 tok->flags &= ~TOK_EAT;
335 return 3;
336 }
337 goto tok_line_outok;
338
339 case Q_single:
340 return 1;
341
342 case Q_double:
343 return 2;
344
345 case Q_doubleone:
346 tok->quote = Q_double;
347 *tok->wptr++ = *ptr;
348 break;
349
350 case Q_one:
351 tok->quote = Q_none;
352 *tok->wptr++ = *ptr;
353 break;
354
355 default:
356 return -1;
357 }
358 break;
359
360 default:
361 tok->flags &= ~TOK_EAT;
362 switch (tok->quote) {
363 case Q_none:
364 if (Strchr(tok->ifs, *ptr) != NULL)
365 FUN(tok,finish)(tok);
366 else
367 *tok->wptr++ = *ptr;
368 break;
369
370 case Q_single:
371 case Q_double:
372 *tok->wptr++ = *ptr;
373 break;
374
375
376 case Q_doubleone:
377 *tok->wptr++ = '\\';
378 tok->quote = Q_double;
379 *tok->wptr++ = *ptr;
380 break;
381
382 case Q_one:
383 tok->quote = Q_none;
384 *tok->wptr++ = *ptr;
385 break;
386
387 default:
388 return -1;
389
390 }
391 break;
392 }
393
394 if (tok->wptr >= tok->wmax - 4) {
395 size_t size = (size_t)(tok->wmax - tok->wspace + WINCR);
396 Char *s = tok_realloc(tok->wspace,
397 size * sizeof(*s));
398 if (s == NULL)
399 return -1;
400
401 if (s != tok->wspace) {
402 size_t i;
403 for (i = 0; i < tok->argc; i++) {
404 tok->argv[i] =
405 (tok->argv[i] - tok->wspace) + s;
406 }
407 tok->wptr = (tok->wptr - tok->wspace) + s;
408 tok->wstart = (tok->wstart - tok->wspace) + s;
409 tok->wspace = s;
410 }
411 tok->wmax = s + size;
412 }
413 if (tok->argc >= tok->amax - 4) {
414 Char **p;
415 tok->amax += AINCR;
416 p = tok_realloc(tok->argv, tok->amax * sizeof(*p));
417 if (p == NULL) {
418 tok->amax -= AINCR;
419 return -1;
420 }
421 tok->argv = p;
422 }
423 }
424 tok_line_outok:
425 if (cc == -1 && co == -1) {
426 cc = (int)tok->argc;
427 co = (int)(tok->wptr - tok->wstart);
428 }
429 if (cursorc != NULL)
430 *cursorc = cc;
431 if (cursoro != NULL)
432 *cursoro = co;
433 FUN(tok,finish)(tok);
434 *argv = (const Char **)tok->argv;
435 *argc = (int)tok->argc;
436 return 0;
437 }
438
439 /* FUN(tok,str)():
440 * Simpler version of tok_line, taking a NUL terminated line
441 * and splitting into words, ignoring cursor state.
442 */
443 public int
444 FUN(tok,str)(TYPE(Tokenizer) *tok, const Char *line, int *argc,
445 const Char ***argv)
446 {
447 TYPE(LineInfo) li;
448
449 memset(&li, 0, sizeof(li));
450 li.buffer = line;
451 li.cursor = li.lastchar = Strchr(line, '\0');
452 return FUN(tok,line)(tok, &li, argc, argv, NULL, NULL);
453 }
454