1 1.29 christos /* $NetBSD: tokenizer.c,v 1.29 2023/05/30 11:53:40 christos Exp $ */ 2 1.2 lukem 3 1.1 cgd /*- 4 1.1 cgd * Copyright (c) 1992, 1993 5 1.1 cgd * The Regents of the University of California. All rights reserved. 6 1.1 cgd * 7 1.1 cgd * This code is derived from software contributed to Berkeley by 8 1.1 cgd * Christos Zoulas of Cornell University. 9 1.1 cgd * 10 1.1 cgd * Redistribution and use in source and binary forms, with or without 11 1.1 cgd * modification, are permitted provided that the following conditions 12 1.1 cgd * are met: 13 1.1 cgd * 1. Redistributions of source code must retain the above copyright 14 1.1 cgd * notice, this list of conditions and the following disclaimer. 15 1.1 cgd * 2. Redistributions in binary form must reproduce the above copyright 16 1.1 cgd * notice, this list of conditions and the following disclaimer in the 17 1.1 cgd * documentation and/or other materials provided with the distribution. 18 1.12 agc * 3. Neither the name of the University nor the names of its contributors 19 1.1 cgd * may be used to endorse or promote products derived from this software 20 1.1 cgd * without specific prior written permission. 21 1.1 cgd * 22 1.1 cgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 1.1 cgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 1.1 cgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 1.1 cgd * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 1.1 cgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 1.1 cgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 1.1 cgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 1.1 cgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 1.1 cgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 1.1 cgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 1.1 cgd * SUCH DAMAGE. 33 1.1 cgd */ 34 1.1 cgd 35 1.10 christos #include "config.h" 36 1.1 cgd #if !defined(lint) && !defined(SCCSID) 37 1.2 lukem #if 0 38 1.1 cgd static char sccsid[] = "@(#)tokenizer.c 8.1 (Berkeley) 6/4/93"; 39 1.2 lukem #else 40 1.29 christos __RCSID("$NetBSD: tokenizer.c,v 1.29 2023/05/30 11:53:40 christos Exp $"); 41 1.2 lukem #endif 42 1.1 cgd #endif /* not lint && not SCCSID */ 43 1.1 cgd 44 1.16 christos /* We build this file twice, once as NARROW, once as WIDE. */ 45 1.1 cgd /* 46 1.1 cgd * tokenize.c: Bourne shell like tokenizer 47 1.1 cgd */ 48 1.24 christos #include <stdlib.h> 49 1.1 cgd #include <string.h> 50 1.24 christos 51 1.14 lukem #include "histedit.h" 52 1.1 cgd 53 1.6 lukem typedef enum { 54 1.6 lukem Q_none, Q_single, Q_double, Q_one, Q_doubleone 55 1.6 lukem } quote_t; 56 1.1 cgd 57 1.6 lukem #define TOK_KEEP 1 58 1.6 lukem #define TOK_EAT 2 59 1.1 cgd 60 1.6 lukem #define WINCR 20 61 1.6 lukem #define AINCR 10 62 1.1 cgd 63 1.16 christos #define IFS STR("\t \n") 64 1.16 christos 65 1.6 lukem #define tok_malloc(a) malloc(a) 66 1.6 lukem #define tok_free(a) free(a) 67 1.6 lukem #define tok_realloc(a, b) realloc(a, b) 68 1.1 cgd 69 1.25 christos #ifdef NARROWCHAR 70 1.26 christos #define Char char 71 1.25 christos #define FUN(prefix, rest) prefix ## _ ## rest 72 1.25 christos #define TYPE(type) type 73 1.25 christos #define STR(x) x 74 1.25 christos #define Strchr(s, c) strchr(s, c) 75 1.25 christos #define tok_strdup(s) strdup(s) 76 1.25 christos #else 77 1.26 christos #define Char wchar_t 78 1.25 christos #define FUN(prefix, rest) prefix ## _w ## rest 79 1.25 christos #define TYPE(type) type ## W 80 1.25 christos #define STR(x) L ## x 81 1.25 christos #define Strchr(s, c) wcschr(s, c) 82 1.25 christos #define tok_strdup(s) wcsdup(s) 83 1.25 christos #endif 84 1.1 cgd 85 1.18 christos struct TYPE(tokenizer) { 86 1.16 christos Char *ifs; /* In field separator */ 87 1.21 christos size_t argc, amax; /* Current and maximum number of args */ 88 1.29 christos const Char **argv; /* Argument list */ 89 1.16 christos Char *wptr, *wmax; /* Space and limit on the word buffer */ 90 1.16 christos Char *wstart; /* Beginning of next word */ 91 1.16 christos Char *wspace; /* Space of word buffer */ 92 1.6 lukem quote_t quote; /* Quoting state */ 93 1.6 lukem int flags; /* flags; */ 94 1.1 cgd }; 95 1.1 cgd 96 1.1 cgd 97 1.28 christos static void FUN(tok,finish)(TYPE(Tokenizer) *); 98 1.1 cgd 99 1.1 cgd 100 1.16 christos /* FUN(tok,finish)(): 101 1.1 cgd * Finish a word in the tokenizer. 102 1.1 cgd */ 103 1.28 christos static void 104 1.16 christos FUN(tok,finish)(TYPE(Tokenizer) *tok) 105 1.1 cgd { 106 1.6 lukem 107 1.6 lukem *tok->wptr = '\0'; 108 1.6 lukem if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) { 109 1.6 lukem tok->argv[tok->argc++] = tok->wstart; 110 1.6 lukem tok->argv[tok->argc] = NULL; 111 1.6 lukem tok->wstart = ++tok->wptr; 112 1.6 lukem } 113 1.6 lukem tok->flags &= ~TOK_KEEP; 114 1.1 cgd } 115 1.1 cgd 116 1.1 cgd 117 1.16 christos /* FUN(tok,init)(): 118 1.1 cgd * Initialize the tokenizer 119 1.1 cgd */ 120 1.28 christos TYPE(Tokenizer) * 121 1.16 christos FUN(tok,init)(const Char *ifs) 122 1.1 cgd { 123 1.19 christos TYPE(Tokenizer) *tok = tok_malloc(sizeof(*tok)); 124 1.1 cgd 125 1.11 christos if (tok == NULL) 126 1.11 christos return NULL; 127 1.13 christos tok->ifs = tok_strdup(ifs ? ifs : IFS); 128 1.11 christos if (tok->ifs == NULL) { 129 1.19 christos tok_free(tok); 130 1.11 christos return NULL; 131 1.11 christos } 132 1.6 lukem tok->argc = 0; 133 1.6 lukem tok->amax = AINCR; 134 1.16 christos tok->argv = tok_malloc(sizeof(*tok->argv) * tok->amax); 135 1.11 christos if (tok->argv == NULL) { 136 1.19 christos tok_free(tok->ifs); 137 1.19 christos tok_free(tok); 138 1.11 christos return NULL; 139 1.11 christos } 140 1.6 lukem tok->argv[0] = NULL; 141 1.16 christos tok->wspace = tok_malloc(WINCR * sizeof(*tok->wspace)); 142 1.11 christos if (tok->wspace == NULL) { 143 1.19 christos tok_free(tok->argv); 144 1.19 christos tok_free(tok->ifs); 145 1.19 christos tok_free(tok); 146 1.11 christos return NULL; 147 1.11 christos } 148 1.6 lukem tok->wmax = tok->wspace + WINCR; 149 1.6 lukem tok->wstart = tok->wspace; 150 1.6 lukem tok->wptr = tok->wspace; 151 1.6 lukem tok->flags = 0; 152 1.6 lukem tok->quote = Q_none; 153 1.1 cgd 154 1.20 christos return tok; 155 1.1 cgd } 156 1.1 cgd 157 1.1 cgd 158 1.16 christos /* FUN(tok,reset)(): 159 1.1 cgd * Reset the tokenizer 160 1.1 cgd */ 161 1.28 christos void 162 1.16 christos FUN(tok,reset)(TYPE(Tokenizer) *tok) 163 1.1 cgd { 164 1.6 lukem 165 1.6 lukem tok->argc = 0; 166 1.6 lukem tok->wstart = tok->wspace; 167 1.6 lukem tok->wptr = tok->wspace; 168 1.6 lukem tok->flags = 0; 169 1.6 lukem tok->quote = Q_none; 170 1.1 cgd } 171 1.1 cgd 172 1.1 cgd 173 1.16 christos /* FUN(tok,end)(): 174 1.1 cgd * Clean up 175 1.1 cgd */ 176 1.28 christos void 177 1.16 christos FUN(tok,end)(TYPE(Tokenizer) *tok) 178 1.1 cgd { 179 1.6 lukem 180 1.19 christos tok_free(tok->ifs); 181 1.19 christos tok_free(tok->wspace); 182 1.19 christos tok_free(tok->argv); 183 1.19 christos tok_free(tok); 184 1.1 cgd } 185 1.1 cgd 186 1.1 cgd 187 1.1 cgd 188 1.16 christos /* FUN(tok,line)(): 189 1.14 lukem * Bourne shell (sh(1)) like tokenizing 190 1.14 lukem * Arguments: 191 1.16 christos * tok current tokenizer state (setup with FUN(tok,init)()) 192 1.14 lukem * line line to parse 193 1.14 lukem * Returns: 194 1.14 lukem * -1 Internal error 195 1.14 lukem * 3 Quoted return 196 1.14 lukem * 2 Unmatched double quote 197 1.14 lukem * 1 Unmatched single quote 198 1.14 lukem * 0 Ok 199 1.14 lukem * Modifies (if return value is 0): 200 1.14 lukem * argc number of arguments 201 1.14 lukem * argv argument array 202 1.14 lukem * cursorc if !NULL, argv element containing cursor 203 1.14 lukem * cursorv if !NULL, offset in argv[cursorc] of cursor 204 1.1 cgd */ 205 1.28 christos int 206 1.17 christos FUN(tok,line)(TYPE(Tokenizer) *tok, const TYPE(LineInfo) *line, 207 1.16 christos int *argc, const Char ***argv, int *cursorc, int *cursoro) 208 1.1 cgd { 209 1.16 christos const Char *ptr; 210 1.14 lukem int cc, co; 211 1.1 cgd 212 1.14 lukem cc = co = -1; 213 1.14 lukem ptr = line->buffer; 214 1.14 lukem for (ptr = line->buffer; ;ptr++) { 215 1.14 lukem if (ptr >= line->lastchar) 216 1.16 christos ptr = STR(""); 217 1.14 lukem if (ptr == line->cursor) { 218 1.21 christos cc = (int)tok->argc; 219 1.15 christos co = (int)(tok->wptr - tok->wstart); 220 1.14 lukem } 221 1.14 lukem switch (*ptr) { 222 1.6 lukem case '\'': 223 1.6 lukem tok->flags |= TOK_KEEP; 224 1.6 lukem tok->flags &= ~TOK_EAT; 225 1.6 lukem switch (tok->quote) { 226 1.6 lukem case Q_none: 227 1.6 lukem tok->quote = Q_single; /* Enter single quote 228 1.6 lukem * mode */ 229 1.6 lukem break; 230 1.6 lukem 231 1.6 lukem case Q_single: /* Exit single quote mode */ 232 1.6 lukem tok->quote = Q_none; 233 1.6 lukem break; 234 1.6 lukem 235 1.6 lukem case Q_one: /* Quote this ' */ 236 1.6 lukem tok->quote = Q_none; 237 1.6 lukem *tok->wptr++ = *ptr; 238 1.6 lukem break; 239 1.6 lukem 240 1.6 lukem case Q_double: /* Stay in double quote mode */ 241 1.6 lukem *tok->wptr++ = *ptr; 242 1.6 lukem break; 243 1.6 lukem 244 1.6 lukem case Q_doubleone: /* Quote this ' */ 245 1.6 lukem tok->quote = Q_double; 246 1.6 lukem *tok->wptr++ = *ptr; 247 1.6 lukem break; 248 1.6 lukem 249 1.6 lukem default: 250 1.20 christos return -1; 251 1.6 lukem } 252 1.6 lukem break; 253 1.6 lukem 254 1.6 lukem case '"': 255 1.6 lukem tok->flags &= ~TOK_EAT; 256 1.6 lukem tok->flags |= TOK_KEEP; 257 1.6 lukem switch (tok->quote) { 258 1.6 lukem case Q_none: /* Enter double quote mode */ 259 1.6 lukem tok->quote = Q_double; 260 1.6 lukem break; 261 1.6 lukem 262 1.6 lukem case Q_double: /* Exit double quote mode */ 263 1.6 lukem tok->quote = Q_none; 264 1.6 lukem break; 265 1.6 lukem 266 1.6 lukem case Q_one: /* Quote this " */ 267 1.6 lukem tok->quote = Q_none; 268 1.6 lukem *tok->wptr++ = *ptr; 269 1.6 lukem break; 270 1.6 lukem 271 1.6 lukem case Q_single: /* Stay in single quote mode */ 272 1.6 lukem *tok->wptr++ = *ptr; 273 1.6 lukem break; 274 1.6 lukem 275 1.6 lukem case Q_doubleone: /* Quote this " */ 276 1.6 lukem tok->quote = Q_double; 277 1.6 lukem *tok->wptr++ = *ptr; 278 1.6 lukem break; 279 1.6 lukem 280 1.6 lukem default: 281 1.20 christos return -1; 282 1.6 lukem } 283 1.6 lukem break; 284 1.6 lukem 285 1.6 lukem case '\\': 286 1.6 lukem tok->flags |= TOK_KEEP; 287 1.6 lukem tok->flags &= ~TOK_EAT; 288 1.6 lukem switch (tok->quote) { 289 1.6 lukem case Q_none: /* Quote next character */ 290 1.6 lukem tok->quote = Q_one; 291 1.6 lukem break; 292 1.6 lukem 293 1.6 lukem case Q_double: /* Quote next character */ 294 1.6 lukem tok->quote = Q_doubleone; 295 1.6 lukem break; 296 1.6 lukem 297 1.6 lukem case Q_one: /* Quote this, restore state */ 298 1.6 lukem *tok->wptr++ = *ptr; 299 1.6 lukem tok->quote = Q_none; 300 1.6 lukem break; 301 1.6 lukem 302 1.6 lukem case Q_single: /* Stay in single quote mode */ 303 1.6 lukem *tok->wptr++ = *ptr; 304 1.6 lukem break; 305 1.6 lukem 306 1.6 lukem case Q_doubleone: /* Quote this \ */ 307 1.6 lukem tok->quote = Q_double; 308 1.6 lukem *tok->wptr++ = *ptr; 309 1.6 lukem break; 310 1.6 lukem 311 1.6 lukem default: 312 1.20 christos return -1; 313 1.6 lukem } 314 1.6 lukem break; 315 1.6 lukem 316 1.6 lukem case '\n': 317 1.6 lukem tok->flags &= ~TOK_EAT; 318 1.6 lukem switch (tok->quote) { 319 1.6 lukem case Q_none: 320 1.14 lukem goto tok_line_outok; 321 1.6 lukem 322 1.6 lukem case Q_single: 323 1.6 lukem case Q_double: 324 1.6 lukem *tok->wptr++ = *ptr; /* Add the return */ 325 1.6 lukem break; 326 1.6 lukem 327 1.6 lukem case Q_doubleone: /* Back to double, eat the '\n' */ 328 1.6 lukem tok->flags |= TOK_EAT; 329 1.6 lukem tok->quote = Q_double; 330 1.6 lukem break; 331 1.6 lukem 332 1.6 lukem case Q_one: /* No quote, more eat the '\n' */ 333 1.6 lukem tok->flags |= TOK_EAT; 334 1.6 lukem tok->quote = Q_none; 335 1.6 lukem break; 336 1.6 lukem 337 1.6 lukem default: 338 1.20 christos return 0; 339 1.6 lukem } 340 1.6 lukem break; 341 1.6 lukem 342 1.6 lukem case '\0': 343 1.6 lukem switch (tok->quote) { 344 1.6 lukem case Q_none: 345 1.6 lukem /* Finish word and return */ 346 1.6 lukem if (tok->flags & TOK_EAT) { 347 1.6 lukem tok->flags &= ~TOK_EAT; 348 1.20 christos return 3; 349 1.6 lukem } 350 1.14 lukem goto tok_line_outok; 351 1.6 lukem 352 1.6 lukem case Q_single: 353 1.20 christos return 1; 354 1.6 lukem 355 1.6 lukem case Q_double: 356 1.20 christos return 2; 357 1.6 lukem 358 1.6 lukem case Q_doubleone: 359 1.6 lukem tok->quote = Q_double; 360 1.6 lukem *tok->wptr++ = *ptr; 361 1.6 lukem break; 362 1.6 lukem 363 1.6 lukem case Q_one: 364 1.6 lukem tok->quote = Q_none; 365 1.6 lukem *tok->wptr++ = *ptr; 366 1.6 lukem break; 367 1.6 lukem 368 1.6 lukem default: 369 1.20 christos return -1; 370 1.6 lukem } 371 1.6 lukem break; 372 1.6 lukem 373 1.6 lukem default: 374 1.6 lukem tok->flags &= ~TOK_EAT; 375 1.6 lukem switch (tok->quote) { 376 1.6 lukem case Q_none: 377 1.16 christos if (Strchr(tok->ifs, *ptr) != NULL) 378 1.16 christos FUN(tok,finish)(tok); 379 1.6 lukem else 380 1.6 lukem *tok->wptr++ = *ptr; 381 1.6 lukem break; 382 1.6 lukem 383 1.6 lukem case Q_single: 384 1.6 lukem case Q_double: 385 1.6 lukem *tok->wptr++ = *ptr; 386 1.6 lukem break; 387 1.6 lukem 388 1.6 lukem 389 1.6 lukem case Q_doubleone: 390 1.6 lukem *tok->wptr++ = '\\'; 391 1.6 lukem tok->quote = Q_double; 392 1.6 lukem *tok->wptr++ = *ptr; 393 1.6 lukem break; 394 1.6 lukem 395 1.6 lukem case Q_one: 396 1.6 lukem tok->quote = Q_none; 397 1.6 lukem *tok->wptr++ = *ptr; 398 1.6 lukem break; 399 1.1 cgd 400 1.6 lukem default: 401 1.20 christos return -1; 402 1.1 cgd 403 1.6 lukem } 404 1.6 lukem break; 405 1.6 lukem } 406 1.1 cgd 407 1.6 lukem if (tok->wptr >= tok->wmax - 4) { 408 1.21 christos size_t size = (size_t)(tok->wmax - tok->wspace + WINCR); 409 1.16 christos Char *s = tok_realloc(tok->wspace, 410 1.16 christos size * sizeof(*s)); 411 1.7 christos if (s == NULL) 412 1.20 christos return -1; 413 1.6 lukem 414 1.8 christos if (s != tok->wspace) { 415 1.21 christos size_t i; 416 1.8 christos for (i = 0; i < tok->argc; i++) { 417 1.8 christos tok->argv[i] = 418 1.8 christos (tok->argv[i] - tok->wspace) + s; 419 1.8 christos } 420 1.8 christos tok->wptr = (tok->wptr - tok->wspace) + s; 421 1.8 christos tok->wstart = (tok->wstart - tok->wspace) + s; 422 1.6 lukem tok->wspace = s; 423 1.6 lukem } 424 1.9 christos tok->wmax = s + size; 425 1.6 lukem } 426 1.6 lukem if (tok->argc >= tok->amax - 4) { 427 1.29 christos const Char **p; 428 1.6 lukem tok->amax += AINCR; 429 1.16 christos p = tok_realloc(tok->argv, tok->amax * sizeof(*p)); 430 1.23 christos if (p == NULL) { 431 1.23 christos tok->amax -= AINCR; 432 1.20 christos return -1; 433 1.23 christos } 434 1.7 christos tok->argv = p; 435 1.6 lukem } 436 1.1 cgd } 437 1.14 lukem tok_line_outok: 438 1.14 lukem if (cc == -1 && co == -1) { 439 1.21 christos cc = (int)tok->argc; 440 1.15 christos co = (int)(tok->wptr - tok->wstart); 441 1.14 lukem } 442 1.14 lukem if (cursorc != NULL) 443 1.14 lukem *cursorc = cc; 444 1.14 lukem if (cursoro != NULL) 445 1.14 lukem *cursoro = co; 446 1.16 christos FUN(tok,finish)(tok); 447 1.29 christos *argv = tok->argv; 448 1.21 christos *argc = (int)tok->argc; 449 1.20 christos return 0; 450 1.14 lukem } 451 1.14 lukem 452 1.16 christos /* FUN(tok,str)(): 453 1.14 lukem * Simpler version of tok_line, taking a NUL terminated line 454 1.14 lukem * and splitting into words, ignoring cursor state. 455 1.14 lukem */ 456 1.28 christos int 457 1.16 christos FUN(tok,str)(TYPE(Tokenizer) *tok, const Char *line, int *argc, 458 1.16 christos const Char ***argv) 459 1.14 lukem { 460 1.17 christos TYPE(LineInfo) li; 461 1.14 lukem 462 1.14 lukem memset(&li, 0, sizeof(li)); 463 1.14 lukem li.buffer = line; 464 1.16 christos li.cursor = li.lastchar = Strchr(line, '\0'); 465 1.22 christos return FUN(tok,line)(tok, &li, argc, argv, NULL, NULL); 466 1.1 cgd } 467