1 1.1 cgd /* 2 1.1 cgd * regcomp and regexec -- regsub and regerror are elsewhere 3 1.1 cgd * 4 1.1 cgd * Copyright (c) 1986 by University of Toronto. 5 1.1 cgd * Written by Henry Spencer. Not derived from licensed software. 6 1.1 cgd * 7 1.1 cgd * Permission is granted to anyone to use this software for any 8 1.1 cgd * purpose on any computer system, and to redistribute it freely, 9 1.1 cgd * subject to the following restrictions: 10 1.1 cgd * 11 1.1 cgd * 1. The author is not responsible for the consequences of use of 12 1.1 cgd * this software, no matter how awful, even if they arise 13 1.1 cgd * from defects in it. 14 1.1 cgd * 15 1.1 cgd * 2. The origin of this software must not be misrepresented, either 16 1.1 cgd * by explicit claim or by omission. 17 1.1 cgd * 18 1.1 cgd * 3. Altered versions must be plainly marked as such, and must not 19 1.1 cgd * be misrepresented as being the original software. 20 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 21 1.1 cgd *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | 22 1.1 cgd *** to assist in implementing egrep. 23 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 24 1.1 cgd *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching 25 1.1 cgd *** as in BSD grep and ex. 26 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, 27 1.1 cgd *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. 28 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, 29 1.1 cgd *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. 30 1.1 cgd * 31 1.1 cgd * Beware that some of this code is subtly aware of the way operator 32 1.1 cgd * precedence is structured in regular expressions. Serious changes in 33 1.1 cgd * regular-expression syntax might require a total rethink. 34 1.1 cgd */ 35 1.2 mycroft 36 1.7 lukem #include <sys/cdefs.h> 37 1.14 msaitoh #if defined(LIBC_SCCS) && !defined(lint) 38 1.19 christos __RCSID("$NetBSD: regexp.c,v 1.19 2016/01/26 16:05:18 christos Exp $"); 39 1.14 msaitoh #endif /* LIBC_SCCS and not lint */ 40 1.2 mycroft 41 1.12 lukem #include <ctype.h> 42 1.1 cgd #include <regexp.h> 43 1.1 cgd #include <stdio.h> 44 1.1 cgd #include <stdlib.h> 45 1.1 cgd #include <string.h> 46 1.1 cgd #include "regmagic.h" 47 1.1 cgd 48 1.1 cgd /* 49 1.1 cgd * The "internal use only" fields in regexp.h are present to pass info from 50 1.1 cgd * compile to execute that permits the execute phase to run lots faster on 51 1.1 cgd * simple cases. They are: 52 1.1 cgd * 53 1.1 cgd * regstart char that must begin a match; '\0' if none obvious 54 1.1 cgd * reganch is the match anchored (at beginning-of-line only)? 55 1.1 cgd * regmust string (pointer into program) that match must include, or NULL 56 1.1 cgd * regmlen length of regmust string 57 1.1 cgd * 58 1.1 cgd * Regstart and reganch permit very fast decisions on suitable starting points 59 1.1 cgd * for a match, cutting down the work a lot. Regmust permits fast rejection 60 1.1 cgd * of lines that cannot possibly match. The regmust tests are costly enough 61 1.1 cgd * that regcomp() supplies a regmust only if the r.e. contains something 62 1.1 cgd * potentially expensive (at present, the only such thing detected is * or + 63 1.1 cgd * at the start of the r.e., which can involve a lot of backup). Regmlen is 64 1.1 cgd * supplied because the test in regexec() needs it and regcomp() is computing 65 1.1 cgd * it anyway. 66 1.1 cgd */ 67 1.1 cgd 68 1.1 cgd /* 69 1.1 cgd * Structure for regexp "program". This is essentially a linear encoding 70 1.1 cgd * of a nondeterministic finite-state machine (aka syntax charts or 71 1.1 cgd * "railroad normal form" in parsing technology). Each node is an opcode 72 1.1 cgd * plus a "next" pointer, possibly plus an operand. "Next" pointers of 73 1.1 cgd * all nodes except BRANCH implement concatenation; a "next" pointer with 74 1.1 cgd * a BRANCH on both ends of it is connecting two alternatives. (Here we 75 1.1 cgd * have one of the subtle syntax dependencies: an individual BRANCH (as 76 1.1 cgd * opposed to a collection of them) is never concatenated with anything 77 1.1 cgd * because of operator precedence.) The operand of some types of node is 78 1.1 cgd * a literal string; for others, it is a node leading into a sub-FSM. In 79 1.1 cgd * particular, the operand of a BRANCH node is the first node of the branch. 80 1.1 cgd * (NB this is *not* a tree structure: the tail of the branch connects 81 1.1 cgd * to the thing following the set of BRANCHes.) The opcodes are: 82 1.1 cgd */ 83 1.1 cgd 84 1.1 cgd /* definition number opnd? meaning */ 85 1.1 cgd #define END 0 /* no End of program. */ 86 1.1 cgd #define BOL 1 /* no Match "" at beginning of line. */ 87 1.1 cgd #define EOL 2 /* no Match "" at end of line. */ 88 1.1 cgd #define ANY 3 /* no Match any one character. */ 89 1.1 cgd #define ANYOF 4 /* str Match any character in this string. */ 90 1.1 cgd #define ANYBUT 5 /* str Match any character not in this string. */ 91 1.1 cgd #define BRANCH 6 /* node Match this alternative, or the next... */ 92 1.1 cgd #define BACK 7 /* no Match "", "next" ptr points backward. */ 93 1.1 cgd #define EXACTLY 8 /* str Match this string. */ 94 1.1 cgd #define NOTHING 9 /* no Match empty string. */ 95 1.1 cgd #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 96 1.1 cgd #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 97 1.1 cgd #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */ 98 1.1 cgd #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */ 99 1.1 cgd #define OPEN 20 /* no Mark this point in input as start of #n. */ 100 1.1 cgd /* OPEN+1 is number 1, etc. */ 101 1.1 cgd #define CLOSE 30 /* no Analogous to OPEN. */ 102 1.1 cgd 103 1.1 cgd /* 104 1.1 cgd * Opcode notes: 105 1.1 cgd * 106 1.1 cgd * BRANCH The set of branches constituting a single choice are hooked 107 1.1 cgd * together with their "next" pointers, since precedence prevents 108 1.1 cgd * anything being concatenated to any individual branch. The 109 1.1 cgd * "next" pointer of the last BRANCH in a choice points to the 110 1.1 cgd * thing following the whole choice. This is also where the 111 1.1 cgd * final "next" pointer of each individual branch points; each 112 1.1 cgd * branch starts with the operand node of a BRANCH node. 113 1.1 cgd * 114 1.1 cgd * BACK Normal "next" pointers all implicitly point forward; BACK 115 1.1 cgd * exists to make loop structures possible. 116 1.1 cgd * 117 1.1 cgd * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 118 1.1 cgd * BRANCH structures using BACK. Simple cases (one character 119 1.1 cgd * per match) are implemented with STAR and PLUS for speed 120 1.1 cgd * and to minimize recursive plunges. 121 1.1 cgd * 122 1.1 cgd * OPEN,CLOSE ...are numbered at compile time. 123 1.1 cgd */ 124 1.1 cgd 125 1.1 cgd /* 126 1.1 cgd * A node is one char of opcode followed by two chars of "next" pointer. 127 1.1 cgd * "Next" pointers are stored as two 8-bit pieces, high order first. The 128 1.1 cgd * value is a positive offset from the opcode of the node containing it. 129 1.1 cgd * An operand, if any, simply follows the node. (Note that much of the 130 1.1 cgd * code generation knows about this implicit relationship.) 131 1.1 cgd * 132 1.1 cgd * Using two bytes for the "next" pointer is vast overkill for most things, 133 1.1 cgd * but allows patterns to get big without disasters. 134 1.1 cgd */ 135 1.1 cgd #define OP(p) (*(p)) 136 1.1 cgd #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 137 1.1 cgd #define OPERAND(p) ((p) + 3) 138 1.1 cgd 139 1.1 cgd /* 140 1.1 cgd * See regmagic.h for one further detail of program structure. 141 1.1 cgd */ 142 1.1 cgd 143 1.1 cgd 144 1.1 cgd /* 145 1.1 cgd * Utility definitions. 146 1.1 cgd */ 147 1.1 cgd #ifndef CHARBITS 148 1.1 cgd #define UCHARAT(p) ((int)*(unsigned char *)(p)) 149 1.1 cgd #else 150 1.1 cgd #define UCHARAT(p) ((int)*(p)&CHARBITS) 151 1.1 cgd #endif 152 1.1 cgd 153 1.1 cgd #define FAIL(m) { regerror(m); return(NULL); } 154 1.1 cgd #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 155 1.1 cgd 156 1.1 cgd /* 157 1.1 cgd * Flags to be passed up and down. 158 1.1 cgd */ 159 1.1 cgd #define HASWIDTH 01 /* Known never to match null string. */ 160 1.1 cgd #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 161 1.1 cgd #define SPSTART 04 /* Starts with * or +. */ 162 1.1 cgd #define WORST 0 /* Worst case. */ 163 1.1 cgd 164 1.1 cgd /* 165 1.1 cgd * Global work variables for regcomp(). 166 1.1 cgd */ 167 1.1 cgd static char *regparse; /* Input-scan pointer. */ 168 1.1 cgd static int regnpar; /* () count. */ 169 1.1 cgd static char regdummy; 170 1.1 cgd static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 171 1.1 cgd static long regsize; /* Code size. */ 172 1.1 cgd 173 1.1 cgd /* 174 1.1 cgd * Forward declarations for regcomp()'s friends. 175 1.1 cgd */ 176 1.1 cgd #ifndef STATIC 177 1.1 cgd #define STATIC static 178 1.1 cgd #endif 179 1.4 pk STATIC char *reg __P((int, int *)); 180 1.4 pk STATIC char *regbranch __P((int *)); 181 1.4 pk STATIC char *regpiece __P((int *)); 182 1.4 pk STATIC char *regatom __P((int *)); 183 1.10 christos STATIC char *regnode __P((int)); 184 1.4 pk STATIC char *regnext __P((char *)); 185 1.10 christos STATIC void regc __P((int)); 186 1.10 christos STATIC void reginsert __P((int, char *)); 187 1.4 pk STATIC void regtail __P((char *, char *)); 188 1.4 pk STATIC void regoptail __P((char *, char *)); 189 1.1 cgd #ifdef STRCSPN 190 1.4 pk STATIC int strcspn __P((char *, char *)); 191 1.1 cgd #endif 192 1.1 cgd 193 1.1 cgd /* 194 1.1 cgd - regcomp - compile a regular expression into internal code 195 1.1 cgd * 196 1.1 cgd * We can't allocate space until we know how big the compiled form will be, 197 1.1 cgd * but we can't compile it (and thus know how big it is) until we've got a 198 1.1 cgd * place to put the code. So we cheat: we compile it twice, once with code 199 1.1 cgd * generation turned off and size counting turned on, and once "for real". 200 1.1 cgd * This also means that we don't allocate space until we are sure that the 201 1.1 cgd * thing really will compile successfully, and we never have to move the 202 1.1 cgd * code and thus invalidate pointers into it. (Note that it has to be in 203 1.1 cgd * one piece because free() must be able to free it all.) 204 1.1 cgd * 205 1.1 cgd * Beware that the optimization-preparation code in here knows about some 206 1.1 cgd * of the structure of the compiled regexp. 207 1.1 cgd */ 208 1.1 cgd regexp * 209 1.15 thorpej __compat_regcomp(expn) 210 1.15 thorpej const char *expn; 211 1.1 cgd { 212 1.8 perry regexp *r; 213 1.8 perry char *scan; 214 1.8 perry char *longest; 215 1.8 perry int len; 216 1.1 cgd int flags; 217 1.1 cgd 218 1.15 thorpej if (expn == NULL) 219 1.1 cgd FAIL("NULL argument"); 220 1.1 cgd 221 1.1 cgd /* First pass: determine size, legality. */ 222 1.1 cgd #ifdef notdef 223 1.15 thorpej if (expn[0] == '.' && expn[1] == '*') expn += 2; /* aid grep */ 224 1.1 cgd #endif 225 1.10 christos /* LINTED const castaway */ 226 1.15 thorpej regparse = (char *)expn; 227 1.1 cgd regnpar = 1; 228 1.1 cgd regsize = 0L; 229 1.1 cgd regcode = ®dummy; 230 1.1 cgd regc(MAGIC); 231 1.1 cgd if (reg(0, &flags) == NULL) 232 1.1 cgd return(NULL); 233 1.1 cgd 234 1.1 cgd /* Small enough for pointer-storage convention? */ 235 1.1 cgd if (regsize >= 32767L) /* Probably could be 65535L. */ 236 1.1 cgd FAIL("regexp too big"); 237 1.1 cgd 238 1.1 cgd /* Allocate space. */ 239 1.19 christos r = malloc(sizeof(regexp) + (unsigned)regsize); 240 1.1 cgd if (r == NULL) 241 1.1 cgd FAIL("out of space"); 242 1.1 cgd 243 1.1 cgd /* Second pass: emit code. */ 244 1.10 christos /* LINTED const castaway */ 245 1.15 thorpej regparse = (char *)expn; 246 1.1 cgd regnpar = 1; 247 1.1 cgd regcode = r->program; 248 1.1 cgd regc(MAGIC); 249 1.19 christos if (reg(0, &flags) == NULL) { 250 1.19 christos free(r); 251 1.1 cgd return(NULL); 252 1.19 christos } 253 1.1 cgd 254 1.1 cgd /* Dig out information for optimizations. */ 255 1.1 cgd r->regstart = '\0'; /* Worst-case defaults. */ 256 1.1 cgd r->reganch = 0; 257 1.1 cgd r->regmust = NULL; 258 1.1 cgd r->regmlen = 0; 259 1.1 cgd scan = r->program+1; /* First BRANCH. */ 260 1.1 cgd if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 261 1.1 cgd scan = OPERAND(scan); 262 1.1 cgd 263 1.1 cgd /* Starting-point info. */ 264 1.1 cgd if (OP(scan) == EXACTLY) 265 1.1 cgd r->regstart = *OPERAND(scan); 266 1.1 cgd else if (OP(scan) == BOL) 267 1.1 cgd r->reganch++; 268 1.1 cgd 269 1.1 cgd /* 270 1.1 cgd * If there's something expensive in the r.e., find the 271 1.1 cgd * longest literal string that must appear and make it the 272 1.1 cgd * regmust. Resolve ties in favor of later strings, since 273 1.1 cgd * the regstart check works with the beginning of the r.e. 274 1.1 cgd * and avoiding duplication strengthens checking. Not a 275 1.1 cgd * strong reason, but sufficient in the absence of others. 276 1.1 cgd */ 277 1.1 cgd if (flags&SPSTART) { 278 1.1 cgd longest = NULL; 279 1.1 cgd len = 0; 280 1.1 cgd for (; scan != NULL; scan = regnext(scan)) 281 1.15 thorpej if (OP(scan) == EXACTLY && (int) strlen(OPERAND(scan)) >= len) { 282 1.1 cgd longest = OPERAND(scan); 283 1.1 cgd len = strlen(OPERAND(scan)); 284 1.1 cgd } 285 1.1 cgd r->regmust = longest; 286 1.1 cgd r->regmlen = len; 287 1.1 cgd } 288 1.1 cgd } 289 1.1 cgd 290 1.1 cgd return(r); 291 1.1 cgd } 292 1.1 cgd 293 1.1 cgd /* 294 1.1 cgd - reg - regular expression, i.e. main body or parenthesized thing 295 1.1 cgd * 296 1.1 cgd * Caller must absorb opening parenthesis. 297 1.1 cgd * 298 1.1 cgd * Combining parenthesis handling with the base level of regular expression 299 1.1 cgd * is a trifle forced, but the need to tie the tails of the branches to what 300 1.1 cgd * follows makes it hard to avoid. 301 1.1 cgd */ 302 1.1 cgd static char * 303 1.1 cgd reg(paren, flagp) 304 1.1 cgd int paren; /* Parenthesized? */ 305 1.1 cgd int *flagp; 306 1.1 cgd { 307 1.8 perry char *ret; 308 1.8 perry char *br; 309 1.8 perry char *ender; 310 1.8 perry int parno = 0; 311 1.1 cgd int flags; 312 1.1 cgd 313 1.1 cgd *flagp = HASWIDTH; /* Tentatively. */ 314 1.1 cgd 315 1.1 cgd /* Make an OPEN node, if parenthesized. */ 316 1.1 cgd if (paren) { 317 1.1 cgd if (regnpar >= NSUBEXP) 318 1.1 cgd FAIL("too many ()"); 319 1.1 cgd parno = regnpar; 320 1.1 cgd regnpar++; 321 1.1 cgd ret = regnode(OPEN+parno); 322 1.1 cgd } else 323 1.1 cgd ret = NULL; 324 1.1 cgd 325 1.1 cgd /* Pick up the branches, linking them together. */ 326 1.1 cgd br = regbranch(&flags); 327 1.1 cgd if (br == NULL) 328 1.1 cgd return(NULL); 329 1.1 cgd if (ret != NULL) 330 1.1 cgd regtail(ret, br); /* OPEN -> first. */ 331 1.1 cgd else 332 1.1 cgd ret = br; 333 1.1 cgd if (!(flags&HASWIDTH)) 334 1.1 cgd *flagp &= ~HASWIDTH; 335 1.1 cgd *flagp |= flags&SPSTART; 336 1.1 cgd while (*regparse == '|' || *regparse == '\n') { 337 1.1 cgd regparse++; 338 1.1 cgd br = regbranch(&flags); 339 1.1 cgd if (br == NULL) 340 1.1 cgd return(NULL); 341 1.1 cgd regtail(ret, br); /* BRANCH -> BRANCH. */ 342 1.1 cgd if (!(flags&HASWIDTH)) 343 1.1 cgd *flagp &= ~HASWIDTH; 344 1.1 cgd *flagp |= flags&SPSTART; 345 1.1 cgd } 346 1.1 cgd 347 1.1 cgd /* Make a closing node, and hook it on the end. */ 348 1.11 simonb ender = regnode((paren) ? CLOSE+parno : END); 349 1.1 cgd regtail(ret, ender); 350 1.1 cgd 351 1.1 cgd /* Hook the tails of the branches to the closing node. */ 352 1.1 cgd for (br = ret; br != NULL; br = regnext(br)) 353 1.1 cgd regoptail(br, ender); 354 1.1 cgd 355 1.1 cgd /* Check for proper termination. */ 356 1.1 cgd if (paren && *regparse++ != ')') { 357 1.1 cgd FAIL("unmatched ()"); 358 1.1 cgd } else if (!paren && *regparse != '\0') { 359 1.1 cgd if (*regparse == ')') { 360 1.1 cgd FAIL("unmatched ()"); 361 1.1 cgd } else 362 1.1 cgd FAIL("junk on end"); /* "Can't happen". */ 363 1.1 cgd /* NOTREACHED */ 364 1.1 cgd } 365 1.1 cgd 366 1.1 cgd return(ret); 367 1.1 cgd } 368 1.1 cgd 369 1.1 cgd /* 370 1.1 cgd - regbranch - one alternative of an | operator 371 1.1 cgd * 372 1.1 cgd * Implements the concatenation operator. 373 1.1 cgd */ 374 1.1 cgd static char * 375 1.1 cgd regbranch(flagp) 376 1.1 cgd int *flagp; 377 1.1 cgd { 378 1.8 perry char *ret; 379 1.8 perry char *chain; 380 1.8 perry char *latest; 381 1.1 cgd int flags; 382 1.1 cgd 383 1.1 cgd *flagp = WORST; /* Tentatively. */ 384 1.1 cgd 385 1.1 cgd ret = regnode(BRANCH); 386 1.1 cgd chain = NULL; 387 1.1 cgd while (*regparse != '\0' && *regparse != ')' && 388 1.1 cgd *regparse != '\n' && *regparse != '|') { 389 1.1 cgd latest = regpiece(&flags); 390 1.1 cgd if (latest == NULL) 391 1.1 cgd return(NULL); 392 1.1 cgd *flagp |= flags&HASWIDTH; 393 1.1 cgd if (chain == NULL) /* First piece. */ 394 1.1 cgd *flagp |= flags&SPSTART; 395 1.1 cgd else 396 1.1 cgd regtail(chain, latest); 397 1.1 cgd chain = latest; 398 1.1 cgd } 399 1.1 cgd if (chain == NULL) /* Loop ran zero times. */ 400 1.1 cgd (void) regnode(NOTHING); 401 1.1 cgd 402 1.1 cgd return(ret); 403 1.1 cgd } 404 1.1 cgd 405 1.1 cgd /* 406 1.1 cgd - regpiece - something followed by possible [*+?] 407 1.1 cgd * 408 1.1 cgd * Note that the branching code sequences used for ? and the general cases 409 1.1 cgd * of * and + are somewhat optimized: they use the same NOTHING node as 410 1.1 cgd * both the endmarker for their branch list and the body of the last branch. 411 1.1 cgd * It might seem that this node could be dispensed with entirely, but the 412 1.1 cgd * endmarker role is not redundant. 413 1.1 cgd */ 414 1.1 cgd static char * 415 1.1 cgd regpiece(flagp) 416 1.1 cgd int *flagp; 417 1.1 cgd { 418 1.8 perry char *ret; 419 1.8 perry char op; 420 1.8 perry char *next; 421 1.1 cgd int flags; 422 1.1 cgd 423 1.1 cgd ret = regatom(&flags); 424 1.1 cgd if (ret == NULL) 425 1.1 cgd return(NULL); 426 1.1 cgd 427 1.1 cgd op = *regparse; 428 1.1 cgd if (!ISMULT(op)) { 429 1.1 cgd *flagp = flags; 430 1.1 cgd return(ret); 431 1.1 cgd } 432 1.1 cgd 433 1.1 cgd if (!(flags&HASWIDTH) && op != '?') 434 1.1 cgd FAIL("*+ operand could be empty"); 435 1.1 cgd *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 436 1.1 cgd 437 1.1 cgd if (op == '*' && (flags&SIMPLE)) 438 1.1 cgd reginsert(STAR, ret); 439 1.1 cgd else if (op == '*') { 440 1.1 cgd /* Emit x* as (x&|), where & means "self". */ 441 1.1 cgd reginsert(BRANCH, ret); /* Either x */ 442 1.1 cgd regoptail(ret, regnode(BACK)); /* and loop */ 443 1.1 cgd regoptail(ret, ret); /* back */ 444 1.1 cgd regtail(ret, regnode(BRANCH)); /* or */ 445 1.1 cgd regtail(ret, regnode(NOTHING)); /* null. */ 446 1.1 cgd } else if (op == '+' && (flags&SIMPLE)) 447 1.1 cgd reginsert(PLUS, ret); 448 1.1 cgd else if (op == '+') { 449 1.1 cgd /* Emit x+ as x(&|), where & means "self". */ 450 1.1 cgd next = regnode(BRANCH); /* Either */ 451 1.1 cgd regtail(ret, next); 452 1.1 cgd regtail(regnode(BACK), ret); /* loop back */ 453 1.1 cgd regtail(next, regnode(BRANCH)); /* or */ 454 1.1 cgd regtail(ret, regnode(NOTHING)); /* null. */ 455 1.1 cgd } else if (op == '?') { 456 1.1 cgd /* Emit x? as (x|) */ 457 1.1 cgd reginsert(BRANCH, ret); /* Either x */ 458 1.1 cgd regtail(ret, regnode(BRANCH)); /* or */ 459 1.1 cgd next = regnode(NOTHING); /* null. */ 460 1.1 cgd regtail(ret, next); 461 1.1 cgd regoptail(ret, next); 462 1.1 cgd } 463 1.1 cgd regparse++; 464 1.1 cgd if (ISMULT(*regparse)) 465 1.1 cgd FAIL("nested *?+"); 466 1.1 cgd 467 1.1 cgd return(ret); 468 1.1 cgd } 469 1.1 cgd 470 1.1 cgd /* 471 1.1 cgd - regatom - the lowest level 472 1.1 cgd * 473 1.1 cgd * Optimization: gobbles an entire sequence of ordinary characters so that 474 1.1 cgd * it can turn them into a single node, which is smaller to store and 475 1.1 cgd * faster to run. Backslashed characters are exceptions, each becoming a 476 1.1 cgd * separate node; the code is simpler that way and it's not worth fixing. 477 1.1 cgd */ 478 1.1 cgd static char * 479 1.1 cgd regatom(flagp) 480 1.1 cgd int *flagp; 481 1.1 cgd { 482 1.8 perry char *ret; 483 1.1 cgd int flags; 484 1.1 cgd 485 1.1 cgd *flagp = WORST; /* Tentatively. */ 486 1.1 cgd 487 1.1 cgd switch (*regparse++) { 488 1.1 cgd /* FIXME: these chars only have meaning at beg/end of pat? */ 489 1.1 cgd case '^': 490 1.1 cgd ret = regnode(BOL); 491 1.1 cgd break; 492 1.1 cgd case '$': 493 1.1 cgd ret = regnode(EOL); 494 1.1 cgd break; 495 1.1 cgd case '.': 496 1.1 cgd ret = regnode(ANY); 497 1.1 cgd *flagp |= HASWIDTH|SIMPLE; 498 1.1 cgd break; 499 1.1 cgd case '[': { 500 1.8 perry int class; 501 1.8 perry int classend; 502 1.1 cgd 503 1.1 cgd if (*regparse == '^') { /* Complement of range. */ 504 1.1 cgd ret = regnode(ANYBUT); 505 1.1 cgd regparse++; 506 1.1 cgd } else 507 1.1 cgd ret = regnode(ANYOF); 508 1.1 cgd if (*regparse == ']' || *regparse == '-') 509 1.1 cgd regc(*regparse++); 510 1.1 cgd while (*regparse != '\0' && *regparse != ']') { 511 1.1 cgd if (*regparse == '-') { 512 1.1 cgd regparse++; 513 1.1 cgd if (*regparse == ']' || *regparse == '\0') 514 1.1 cgd regc('-'); 515 1.1 cgd else { 516 1.1 cgd class = UCHARAT(regparse-2)+1; 517 1.1 cgd classend = UCHARAT(regparse); 518 1.1 cgd if (class > classend+1) 519 1.1 cgd FAIL("invalid [] range"); 520 1.1 cgd for (; class <= classend; class++) 521 1.1 cgd regc(class); 522 1.1 cgd regparse++; 523 1.1 cgd } 524 1.1 cgd } else 525 1.1 cgd regc(*regparse++); 526 1.1 cgd } 527 1.1 cgd regc('\0'); 528 1.1 cgd if (*regparse != ']') 529 1.1 cgd FAIL("unmatched []"); 530 1.1 cgd regparse++; 531 1.1 cgd *flagp |= HASWIDTH|SIMPLE; 532 1.1 cgd } 533 1.1 cgd break; 534 1.1 cgd case '(': 535 1.1 cgd ret = reg(1, &flags); 536 1.1 cgd if (ret == NULL) 537 1.1 cgd return(NULL); 538 1.1 cgd *flagp |= flags&(HASWIDTH|SPSTART); 539 1.1 cgd break; 540 1.1 cgd case '\0': 541 1.1 cgd case '|': 542 1.1 cgd case '\n': 543 1.1 cgd case ')': 544 1.1 cgd FAIL("internal urp"); /* Supposed to be caught earlier. */ 545 1.1 cgd case '?': 546 1.1 cgd case '+': 547 1.1 cgd case '*': 548 1.1 cgd FAIL("?+* follows nothing"); 549 1.1 cgd case '\\': 550 1.1 cgd switch (*regparse++) { 551 1.1 cgd case '\0': 552 1.1 cgd FAIL("trailing \\"); 553 1.1 cgd case '<': 554 1.1 cgd ret = regnode(WORDA); 555 1.1 cgd break; 556 1.1 cgd case '>': 557 1.1 cgd ret = regnode(WORDZ); 558 1.1 cgd break; 559 1.1 cgd /* FIXME: Someday handle \1, \2, ... */ 560 1.1 cgd default: 561 1.1 cgd /* Handle general quoted chars in exact-match routine */ 562 1.1 cgd goto de_fault; 563 1.1 cgd } 564 1.1 cgd break; 565 1.1 cgd de_fault: 566 1.10 christos /*FALLTHROUGH*/ 567 1.1 cgd default: 568 1.1 cgd /* 569 1.1 cgd * Encode a string of characters to be matched exactly. 570 1.1 cgd * 571 1.1 cgd * This is a bit tricky due to quoted chars and due to 572 1.1 cgd * '*', '+', and '?' taking the SINGLE char previous 573 1.1 cgd * as their operand. 574 1.1 cgd * 575 1.1 cgd * On entry, the char at regparse[-1] is going to go 576 1.1 cgd * into the string, no matter what it is. (It could be 577 1.1 cgd * following a \ if we are entered from the '\' case.) 578 1.11 simonb * 579 1.1 cgd * Basic idea is to pick up a good char in ch and 580 1.1 cgd * examine the next char. If it's *+? then we twiddle. 581 1.1 cgd * If it's \ then we frozzle. If it's other magic char 582 1.1 cgd * we push ch and terminate the string. If none of the 583 1.1 cgd * above, we push ch on the string and go around again. 584 1.1 cgd * 585 1.1 cgd * regprev is used to remember where "the current char" 586 1.1 cgd * starts in the string, if due to a *+? we need to back 587 1.1 cgd * up and put the current char in a separate, 1-char, string. 588 1.1 cgd * When regprev is NULL, ch is the only char in the 589 1.1 cgd * string; this is used in *+? handling, and in setting 590 1.1 cgd * flags |= SIMPLE at the end. 591 1.1 cgd */ 592 1.1 cgd { 593 1.1 cgd char *regprev; 594 1.8 perry char ch; 595 1.1 cgd 596 1.1 cgd regparse--; /* Look at cur char */ 597 1.1 cgd ret = regnode(EXACTLY); 598 1.1 cgd for ( regprev = 0 ; ; ) { 599 1.1 cgd ch = *regparse++; /* Get current char */ 600 1.1 cgd switch (*regparse) { /* look at next one */ 601 1.1 cgd 602 1.1 cgd default: 603 1.1 cgd regc(ch); /* Add cur to string */ 604 1.1 cgd break; 605 1.1 cgd 606 1.1 cgd case '.': case '[': case '(': 607 1.1 cgd case ')': case '|': case '\n': 608 1.1 cgd case '$': case '^': 609 1.1 cgd case '\0': 610 1.1 cgd /* FIXME, $ and ^ should not always be magic */ 611 1.1 cgd magic: 612 1.1 cgd regc(ch); /* dump cur char */ 613 1.1 cgd goto done; /* and we are done */ 614 1.1 cgd 615 1.1 cgd case '?': case '+': case '*': 616 1.1 cgd if (!regprev) /* If just ch in str, */ 617 1.1 cgd goto magic; /* use it */ 618 1.1 cgd /* End mult-char string one early */ 619 1.1 cgd regparse = regprev; /* Back up parse */ 620 1.1 cgd goto done; 621 1.1 cgd 622 1.1 cgd case '\\': 623 1.1 cgd regc(ch); /* Cur char OK */ 624 1.1 cgd switch (regparse[1]){ /* Look after \ */ 625 1.1 cgd case '\0': 626 1.1 cgd case '<': 627 1.1 cgd case '>': 628 1.1 cgd /* FIXME: Someday handle \1, \2, ... */ 629 1.1 cgd goto done; /* Not quoted */ 630 1.1 cgd default: 631 1.1 cgd /* Backup point is \, scan * point is after it. */ 632 1.1 cgd regprev = regparse; 633 1.11 simonb regparse++; 634 1.1 cgd continue; /* NOT break; */ 635 1.1 cgd } 636 1.1 cgd } 637 1.1 cgd regprev = regparse; /* Set backup point */ 638 1.1 cgd } 639 1.1 cgd done: 640 1.1 cgd regc('\0'); 641 1.1 cgd *flagp |= HASWIDTH; 642 1.1 cgd if (!regprev) /* One char? */ 643 1.1 cgd *flagp |= SIMPLE; 644 1.1 cgd } 645 1.1 cgd break; 646 1.1 cgd } 647 1.1 cgd 648 1.1 cgd return(ret); 649 1.1 cgd } 650 1.1 cgd 651 1.1 cgd /* 652 1.1 cgd - regnode - emit a node 653 1.1 cgd */ 654 1.1 cgd static char * /* Location. */ 655 1.1 cgd regnode(op) 656 1.10 christos int op; 657 1.1 cgd { 658 1.8 perry char *ret; 659 1.8 perry char *ptr; 660 1.1 cgd 661 1.1 cgd ret = regcode; 662 1.1 cgd if (ret == ®dummy) { 663 1.1 cgd regsize += 3; 664 1.1 cgd return(ret); 665 1.1 cgd } 666 1.1 cgd 667 1.1 cgd ptr = ret; 668 1.1 cgd *ptr++ = op; 669 1.1 cgd *ptr++ = '\0'; /* Null "next" pointer. */ 670 1.1 cgd *ptr++ = '\0'; 671 1.1 cgd regcode = ptr; 672 1.1 cgd 673 1.1 cgd return(ret); 674 1.1 cgd } 675 1.1 cgd 676 1.1 cgd /* 677 1.1 cgd - regc - emit (if appropriate) a byte of code 678 1.1 cgd */ 679 1.1 cgd static void 680 1.1 cgd regc(b) 681 1.10 christos int b; 682 1.1 cgd { 683 1.1 cgd if (regcode != ®dummy) 684 1.1 cgd *regcode++ = b; 685 1.1 cgd else 686 1.1 cgd regsize++; 687 1.1 cgd } 688 1.1 cgd 689 1.1 cgd /* 690 1.1 cgd - reginsert - insert an operator in front of already-emitted operand 691 1.1 cgd * 692 1.1 cgd * Means relocating the operand. 693 1.1 cgd */ 694 1.1 cgd static void 695 1.1 cgd reginsert(op, opnd) 696 1.10 christos int op; 697 1.1 cgd char *opnd; 698 1.1 cgd { 699 1.8 perry char *src; 700 1.8 perry char *dst; 701 1.8 perry char *place; 702 1.1 cgd 703 1.1 cgd if (regcode == ®dummy) { 704 1.1 cgd regsize += 3; 705 1.1 cgd return; 706 1.1 cgd } 707 1.1 cgd 708 1.1 cgd src = regcode; 709 1.1 cgd regcode += 3; 710 1.1 cgd dst = regcode; 711 1.1 cgd while (src > opnd) 712 1.1 cgd *--dst = *--src; 713 1.1 cgd 714 1.1 cgd place = opnd; /* Op node, where operand used to be. */ 715 1.1 cgd *place++ = op; 716 1.1 cgd *place++ = '\0'; 717 1.1 cgd *place++ = '\0'; 718 1.1 cgd } 719 1.1 cgd 720 1.1 cgd /* 721 1.1 cgd - regtail - set the next-pointer at the end of a node chain 722 1.1 cgd */ 723 1.1 cgd static void 724 1.1 cgd regtail(p, val) 725 1.1 cgd char *p; 726 1.1 cgd char *val; 727 1.1 cgd { 728 1.8 perry char *scan; 729 1.8 perry char *temp; 730 1.8 perry int offset; 731 1.1 cgd 732 1.1 cgd if (p == ®dummy) 733 1.1 cgd return; 734 1.1 cgd 735 1.1 cgd /* Find last node. */ 736 1.1 cgd scan = p; 737 1.1 cgd for (;;) { 738 1.1 cgd temp = regnext(scan); 739 1.1 cgd if (temp == NULL) 740 1.1 cgd break; 741 1.1 cgd scan = temp; 742 1.1 cgd } 743 1.1 cgd 744 1.1 cgd if (OP(scan) == BACK) 745 1.1 cgd offset = scan - val; 746 1.1 cgd else 747 1.1 cgd offset = val - scan; 748 1.10 christos *(scan+1) = ((unsigned int)offset>>8)&0377; 749 1.1 cgd *(scan+2) = offset&0377; 750 1.1 cgd } 751 1.1 cgd 752 1.1 cgd /* 753 1.1 cgd - regoptail - regtail on operand of first argument; nop if operandless 754 1.1 cgd */ 755 1.1 cgd static void 756 1.1 cgd regoptail(p, val) 757 1.1 cgd char *p; 758 1.1 cgd char *val; 759 1.1 cgd { 760 1.1 cgd /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 761 1.1 cgd if (p == NULL || p == ®dummy || OP(p) != BRANCH) 762 1.1 cgd return; 763 1.1 cgd regtail(OPERAND(p), val); 764 1.1 cgd } 765 1.1 cgd 766 1.1 cgd /* 767 1.1 cgd * regexec and friends 768 1.1 cgd */ 769 1.1 cgd 770 1.1 cgd /* 771 1.1 cgd * Global work variables for regexec(). 772 1.1 cgd */ 773 1.1 cgd static char *reginput; /* String-input pointer. */ 774 1.1 cgd static char *regbol; /* Beginning of input, for ^ check. */ 775 1.1 cgd static char **regstartp; /* Pointer to startp array. */ 776 1.1 cgd static char **regendp; /* Ditto for endp. */ 777 1.1 cgd 778 1.1 cgd /* 779 1.1 cgd * Forwards. 780 1.1 cgd */ 781 1.4 pk STATIC int regtry __P((const regexp *, const char *)); 782 1.4 pk STATIC int regmatch __P((char *)); 783 1.4 pk STATIC int regrepeat __P((char *)); 784 1.1 cgd 785 1.1 cgd #ifdef DEBUG 786 1.1 cgd int regnarrate = 0; 787 1.4 pk void regdump __P((regexp *)); 788 1.4 pk STATIC char *regprop __P((char *)); 789 1.1 cgd #endif 790 1.1 cgd 791 1.1 cgd /* 792 1.1 cgd - regexec - match a regexp against a string 793 1.1 cgd */ 794 1.1 cgd int 795 1.9 tv __compat_regexec(prog, string) 796 1.8 perry const regexp *prog; 797 1.8 perry const char *string; 798 1.1 cgd { 799 1.8 perry char *s; 800 1.1 cgd 801 1.1 cgd /* Be paranoid... */ 802 1.1 cgd if (prog == NULL || string == NULL) { 803 1.1 cgd regerror("NULL parameter"); 804 1.1 cgd return(0); 805 1.1 cgd } 806 1.1 cgd 807 1.1 cgd /* Check validity of program. */ 808 1.1 cgd if (UCHARAT(prog->program) != MAGIC) { 809 1.1 cgd regerror("corrupted program"); 810 1.1 cgd return(0); 811 1.1 cgd } 812 1.1 cgd 813 1.1 cgd /* If there is a "must appear" string, look for it. */ 814 1.1 cgd if (prog->regmust != NULL) { 815 1.10 christos /* LINTED const castaway */ 816 1.1 cgd s = (char *)string; 817 1.1 cgd while ((s = strchr(s, prog->regmust[0])) != NULL) { 818 1.10 christos if (strncmp(s, prog->regmust, 819 1.10 christos (size_t)prog->regmlen) == 0) 820 1.1 cgd break; /* Found it. */ 821 1.1 cgd s++; 822 1.1 cgd } 823 1.1 cgd if (s == NULL) /* Not present. */ 824 1.1 cgd return(0); 825 1.1 cgd } 826 1.1 cgd 827 1.1 cgd /* Mark beginning of line for ^ . */ 828 1.10 christos /* LINTED const castaway */ 829 1.1 cgd regbol = (char *)string; 830 1.1 cgd 831 1.1 cgd /* Simplest case: anchored match need be tried only once. */ 832 1.1 cgd if (prog->reganch) 833 1.1 cgd return(regtry(prog, string)); 834 1.1 cgd 835 1.1 cgd /* Messy cases: unanchored match. */ 836 1.10 christos /* LINTED const castaway */ 837 1.1 cgd s = (char *)string; 838 1.1 cgd if (prog->regstart != '\0') 839 1.1 cgd /* We know what char it must start with. */ 840 1.1 cgd while ((s = strchr(s, prog->regstart)) != NULL) { 841 1.1 cgd if (regtry(prog, s)) 842 1.1 cgd return(1); 843 1.1 cgd s++; 844 1.1 cgd } 845 1.1 cgd else 846 1.1 cgd /* We don't -- general case. */ 847 1.1 cgd do { 848 1.1 cgd if (regtry(prog, s)) 849 1.1 cgd return(1); 850 1.1 cgd } while (*s++ != '\0'); 851 1.1 cgd 852 1.1 cgd /* Failure. */ 853 1.1 cgd return(0); 854 1.1 cgd } 855 1.1 cgd 856 1.1 cgd /* 857 1.1 cgd - regtry - try match at specific point 858 1.1 cgd */ 859 1.1 cgd static int /* 0 failure, 1 success */ 860 1.1 cgd regtry(prog, string) 861 1.4 pk const regexp *prog; 862 1.4 pk const char *string; 863 1.1 cgd { 864 1.8 perry int i; 865 1.8 perry char **sp; 866 1.8 perry char **ep; 867 1.1 cgd 868 1.10 christos /* LINTED const castaway */ 869 1.5 cgd reginput = (char *)string; /* XXX */ 870 1.5 cgd regstartp = (char **)prog->startp; /* XXX */ 871 1.5 cgd regendp = (char **)prog->endp; /* XXX */ 872 1.1 cgd 873 1.5 cgd sp = (char **)prog->startp; /* XXX */ 874 1.5 cgd ep = (char **)prog->endp; /* XXX */ 875 1.1 cgd for (i = NSUBEXP; i > 0; i--) { 876 1.1 cgd *sp++ = NULL; 877 1.1 cgd *ep++ = NULL; 878 1.1 cgd } 879 1.5 cgd if (regmatch((char *)prog->program + 1)) { /* XXX */ 880 1.10 christos /* LINTED const castaway */ 881 1.5 cgd ((regexp *)prog)->startp[0] = (char *)string; /* XXX */ 882 1.10 christos /* LINTED const castaway */ 883 1.5 cgd ((regexp *)prog)->endp[0] = reginput; /* XXX */ 884 1.1 cgd return(1); 885 1.1 cgd } else 886 1.1 cgd return(0); 887 1.1 cgd } 888 1.1 cgd 889 1.1 cgd /* 890 1.1 cgd - regmatch - main matching routine 891 1.1 cgd * 892 1.1 cgd * Conceptually the strategy is simple: check to see whether the current 893 1.1 cgd * node matches, call self recursively to see whether the rest matches, 894 1.1 cgd * and then act accordingly. In practice we make some effort to avoid 895 1.1 cgd * recursion, in particular by going through "ordinary" nodes (that don't 896 1.1 cgd * need to know whether the rest of the match failed) by a loop instead of 897 1.1 cgd * by recursion. 898 1.1 cgd */ 899 1.1 cgd static int /* 0 failure, 1 success */ 900 1.1 cgd regmatch(prog) 901 1.1 cgd char *prog; 902 1.1 cgd { 903 1.8 perry char *scan; /* Current node. */ 904 1.1 cgd char *next; /* Next node. */ 905 1.1 cgd 906 1.1 cgd scan = prog; 907 1.1 cgd #ifdef DEBUG 908 1.1 cgd if (scan != NULL && regnarrate) 909 1.1 cgd fprintf(stderr, "%s(\n", regprop(scan)); 910 1.1 cgd #endif 911 1.1 cgd while (scan != NULL) { 912 1.1 cgd #ifdef DEBUG 913 1.1 cgd if (regnarrate) 914 1.1 cgd fprintf(stderr, "%s...\n", regprop(scan)); 915 1.1 cgd #endif 916 1.1 cgd next = regnext(scan); 917 1.1 cgd 918 1.1 cgd switch (OP(scan)) { 919 1.1 cgd case BOL: 920 1.1 cgd if (reginput != regbol) 921 1.1 cgd return(0); 922 1.1 cgd break; 923 1.1 cgd case EOL: 924 1.1 cgd if (*reginput != '\0') 925 1.1 cgd return(0); 926 1.1 cgd break; 927 1.1 cgd case WORDA: 928 1.1 cgd /* Must be looking at a letter, digit, or _ */ 929 1.13 itohy if ((!isalnum(UCHARAT(reginput))) && *reginput != '_') 930 1.1 cgd return(0); 931 1.1 cgd /* Prev must be BOL or nonword */ 932 1.1 cgd if (reginput > regbol && 933 1.13 itohy (isalnum(UCHARAT(reginput - 1)) 934 1.13 itohy || reginput[-1] == '_')) 935 1.1 cgd return(0); 936 1.1 cgd break; 937 1.1 cgd case WORDZ: 938 1.1 cgd /* Must be looking at non letter, digit, or _ */ 939 1.13 itohy if (isalnum(UCHARAT(reginput)) || *reginput == '_') 940 1.1 cgd return(0); 941 1.1 cgd /* We don't care what the previous char was */ 942 1.1 cgd break; 943 1.1 cgd case ANY: 944 1.1 cgd if (*reginput == '\0') 945 1.1 cgd return(0); 946 1.1 cgd reginput++; 947 1.1 cgd break; 948 1.1 cgd case EXACTLY: { 949 1.8 perry int len; 950 1.8 perry char *opnd; 951 1.1 cgd 952 1.1 cgd opnd = OPERAND(scan); 953 1.1 cgd /* Inline the first character, for speed. */ 954 1.1 cgd if (*opnd != *reginput) 955 1.1 cgd return(0); 956 1.1 cgd len = strlen(opnd); 957 1.11 simonb if (len > 1 && strncmp(opnd, reginput, 958 1.10 christos (size_t)len) != 0) 959 1.1 cgd return(0); 960 1.1 cgd reginput += len; 961 1.1 cgd } 962 1.1 cgd break; 963 1.1 cgd case ANYOF: 964 1.1 cgd if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 965 1.1 cgd return(0); 966 1.1 cgd reginput++; 967 1.1 cgd break; 968 1.1 cgd case ANYBUT: 969 1.1 cgd if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 970 1.1 cgd return(0); 971 1.1 cgd reginput++; 972 1.1 cgd break; 973 1.1 cgd case NOTHING: 974 1.1 cgd break; 975 1.1 cgd case BACK: 976 1.1 cgd break; 977 1.1 cgd case OPEN+1: 978 1.1 cgd case OPEN+2: 979 1.1 cgd case OPEN+3: 980 1.1 cgd case OPEN+4: 981 1.1 cgd case OPEN+5: 982 1.1 cgd case OPEN+6: 983 1.1 cgd case OPEN+7: 984 1.1 cgd case OPEN+8: 985 1.1 cgd case OPEN+9: { 986 1.8 perry int no; 987 1.8 perry char *save; 988 1.1 cgd 989 1.1 cgd no = OP(scan) - OPEN; 990 1.1 cgd save = reginput; 991 1.1 cgd 992 1.1 cgd if (regmatch(next)) { 993 1.1 cgd /* 994 1.1 cgd * Don't set startp if some later 995 1.1 cgd * invocation of the same parentheses 996 1.1 cgd * already has. 997 1.1 cgd */ 998 1.1 cgd if (regstartp[no] == NULL) 999 1.1 cgd regstartp[no] = save; 1000 1.1 cgd return(1); 1001 1.1 cgd } else 1002 1.1 cgd return(0); 1003 1.1 cgd } 1004 1.1 cgd case CLOSE+1: 1005 1.1 cgd case CLOSE+2: 1006 1.1 cgd case CLOSE+3: 1007 1.1 cgd case CLOSE+4: 1008 1.1 cgd case CLOSE+5: 1009 1.1 cgd case CLOSE+6: 1010 1.1 cgd case CLOSE+7: 1011 1.1 cgd case CLOSE+8: 1012 1.1 cgd case CLOSE+9: { 1013 1.8 perry int no; 1014 1.8 perry char *save; 1015 1.1 cgd 1016 1.1 cgd no = OP(scan) - CLOSE; 1017 1.1 cgd save = reginput; 1018 1.1 cgd 1019 1.1 cgd if (regmatch(next)) { 1020 1.1 cgd /* 1021 1.1 cgd * Don't set endp if some later 1022 1.1 cgd * invocation of the same parentheses 1023 1.1 cgd * already has. 1024 1.1 cgd */ 1025 1.1 cgd if (regendp[no] == NULL) 1026 1.1 cgd regendp[no] = save; 1027 1.1 cgd return(1); 1028 1.1 cgd } else 1029 1.1 cgd return(0); 1030 1.1 cgd } 1031 1.1 cgd case BRANCH: { 1032 1.8 perry char *save; 1033 1.1 cgd 1034 1.1 cgd if (OP(next) != BRANCH) /* No choice. */ 1035 1.1 cgd next = OPERAND(scan); /* Avoid recursion. */ 1036 1.1 cgd else { 1037 1.1 cgd do { 1038 1.1 cgd save = reginput; 1039 1.1 cgd if (regmatch(OPERAND(scan))) 1040 1.1 cgd return(1); 1041 1.1 cgd reginput = save; 1042 1.1 cgd scan = regnext(scan); 1043 1.1 cgd } while (scan != NULL && OP(scan) == BRANCH); 1044 1.1 cgd return(0); 1045 1.1 cgd /* NOTREACHED */ 1046 1.1 cgd } 1047 1.1 cgd } 1048 1.1 cgd break; 1049 1.1 cgd case STAR: 1050 1.1 cgd case PLUS: { 1051 1.8 perry char nextch; 1052 1.8 perry int no; 1053 1.8 perry char *save; 1054 1.8 perry int min; 1055 1.1 cgd 1056 1.1 cgd /* 1057 1.1 cgd * Lookahead to avoid useless match attempts 1058 1.1 cgd * when we know what character comes next. 1059 1.1 cgd */ 1060 1.1 cgd nextch = '\0'; 1061 1.1 cgd if (OP(next) == EXACTLY) 1062 1.1 cgd nextch = *OPERAND(next); 1063 1.1 cgd min = (OP(scan) == STAR) ? 0 : 1; 1064 1.1 cgd save = reginput; 1065 1.1 cgd no = regrepeat(OPERAND(scan)); 1066 1.1 cgd while (no >= min) { 1067 1.1 cgd /* If it could work, try it. */ 1068 1.1 cgd if (nextch == '\0' || *reginput == nextch) 1069 1.1 cgd if (regmatch(next)) 1070 1.1 cgd return(1); 1071 1.1 cgd /* Couldn't or didn't -- back up. */ 1072 1.1 cgd no--; 1073 1.1 cgd reginput = save + no; 1074 1.1 cgd } 1075 1.1 cgd return(0); 1076 1.1 cgd } 1077 1.1 cgd case END: 1078 1.1 cgd return(1); /* Success! */ 1079 1.1 cgd default: 1080 1.1 cgd regerror("memory corruption"); 1081 1.1 cgd return(0); 1082 1.1 cgd } 1083 1.1 cgd 1084 1.1 cgd scan = next; 1085 1.1 cgd } 1086 1.1 cgd 1087 1.1 cgd /* 1088 1.1 cgd * We get here only if there's trouble -- normally "case END" is 1089 1.1 cgd * the terminating point. 1090 1.1 cgd */ 1091 1.1 cgd regerror("corrupted pointers"); 1092 1.1 cgd return(0); 1093 1.1 cgd } 1094 1.1 cgd 1095 1.1 cgd /* 1096 1.1 cgd - regrepeat - repeatedly match something simple, report how many 1097 1.1 cgd */ 1098 1.1 cgd static int 1099 1.1 cgd regrepeat(p) 1100 1.1 cgd char *p; 1101 1.1 cgd { 1102 1.8 perry int count = 0; 1103 1.8 perry char *scan; 1104 1.8 perry char *opnd; 1105 1.1 cgd 1106 1.1 cgd scan = reginput; 1107 1.1 cgd opnd = OPERAND(p); 1108 1.1 cgd switch (OP(p)) { 1109 1.1 cgd case ANY: 1110 1.1 cgd count = strlen(scan); 1111 1.1 cgd scan += count; 1112 1.1 cgd break; 1113 1.1 cgd case EXACTLY: 1114 1.1 cgd while (*opnd == *scan) { 1115 1.1 cgd count++; 1116 1.1 cgd scan++; 1117 1.1 cgd } 1118 1.1 cgd break; 1119 1.1 cgd case ANYOF: 1120 1.1 cgd while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1121 1.1 cgd count++; 1122 1.1 cgd scan++; 1123 1.1 cgd } 1124 1.1 cgd break; 1125 1.1 cgd case ANYBUT: 1126 1.1 cgd while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1127 1.1 cgd count++; 1128 1.1 cgd scan++; 1129 1.1 cgd } 1130 1.1 cgd break; 1131 1.1 cgd default: /* Oh dear. Called inappropriately. */ 1132 1.1 cgd regerror("internal foulup"); 1133 1.1 cgd count = 0; /* Best compromise. */ 1134 1.1 cgd break; 1135 1.1 cgd } 1136 1.1 cgd reginput = scan; 1137 1.1 cgd 1138 1.1 cgd return(count); 1139 1.1 cgd } 1140 1.1 cgd 1141 1.1 cgd /* 1142 1.1 cgd - regnext - dig the "next" pointer out of a node 1143 1.1 cgd */ 1144 1.1 cgd static char * 1145 1.1 cgd regnext(p) 1146 1.8 perry char *p; 1147 1.1 cgd { 1148 1.8 perry int offset; 1149 1.1 cgd 1150 1.1 cgd if (p == ®dummy) 1151 1.1 cgd return(NULL); 1152 1.1 cgd 1153 1.1 cgd offset = NEXT(p); 1154 1.1 cgd if (offset == 0) 1155 1.1 cgd return(NULL); 1156 1.1 cgd 1157 1.1 cgd if (OP(p) == BACK) 1158 1.1 cgd return(p-offset); 1159 1.1 cgd else 1160 1.1 cgd return(p+offset); 1161 1.1 cgd } 1162 1.1 cgd 1163 1.1 cgd #ifdef DEBUG 1164 1.1 cgd 1165 1.1 cgd /* 1166 1.1 cgd - regdump - dump a regexp onto stdout in vaguely comprehensible form 1167 1.1 cgd */ 1168 1.1 cgd void 1169 1.1 cgd regdump(r) 1170 1.1 cgd regexp *r; 1171 1.1 cgd { 1172 1.8 perry char *s; 1173 1.8 perry char op = EXACTLY; /* Arbitrary non-END op. */ 1174 1.8 perry char *next; 1175 1.1 cgd 1176 1.1 cgd 1177 1.1 cgd s = r->program + 1; 1178 1.1 cgd while (op != END) { /* While that wasn't END last time... */ 1179 1.1 cgd op = OP(s); 1180 1.18 freza printf("%2td%s", s-r->program, regprop(s)); /* Where, what. */ 1181 1.1 cgd next = regnext(s); 1182 1.1 cgd if (next == NULL) /* Next ptr. */ 1183 1.1 cgd printf("(0)"); 1184 1.11 simonb else 1185 1.18 freza printf("(%td)", (s-r->program)+(next-s)); 1186 1.1 cgd s += 3; 1187 1.1 cgd if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1188 1.1 cgd /* Literal string, where present. */ 1189 1.1 cgd while (*s != '\0') { 1190 1.1 cgd putchar(*s); 1191 1.1 cgd s++; 1192 1.1 cgd } 1193 1.1 cgd s++; 1194 1.1 cgd } 1195 1.1 cgd putchar('\n'); 1196 1.1 cgd } 1197 1.1 cgd 1198 1.1 cgd /* Header fields of interest. */ 1199 1.1 cgd if (r->regstart != '\0') 1200 1.1 cgd printf("start `%c' ", r->regstart); 1201 1.1 cgd if (r->reganch) 1202 1.1 cgd printf("anchored "); 1203 1.1 cgd if (r->regmust != NULL) 1204 1.1 cgd printf("must have \"%s\"", r->regmust); 1205 1.1 cgd printf("\n"); 1206 1.1 cgd } 1207 1.1 cgd 1208 1.1 cgd /* 1209 1.1 cgd - regprop - printable representation of opcode 1210 1.1 cgd */ 1211 1.1 cgd static char * 1212 1.1 cgd regprop(op) 1213 1.1 cgd char *op; 1214 1.1 cgd { 1215 1.8 perry char *p; 1216 1.1 cgd static char buf[50]; 1217 1.1 cgd 1218 1.6 mrg (void)strncpy(buf, ":", sizeof(buf) - 1); 1219 1.1 cgd 1220 1.1 cgd switch (OP(op)) { 1221 1.1 cgd case BOL: 1222 1.1 cgd p = "BOL"; 1223 1.1 cgd break; 1224 1.1 cgd case EOL: 1225 1.1 cgd p = "EOL"; 1226 1.1 cgd break; 1227 1.1 cgd case ANY: 1228 1.1 cgd p = "ANY"; 1229 1.1 cgd break; 1230 1.1 cgd case ANYOF: 1231 1.1 cgd p = "ANYOF"; 1232 1.1 cgd break; 1233 1.1 cgd case ANYBUT: 1234 1.1 cgd p = "ANYBUT"; 1235 1.1 cgd break; 1236 1.1 cgd case BRANCH: 1237 1.1 cgd p = "BRANCH"; 1238 1.1 cgd break; 1239 1.1 cgd case EXACTLY: 1240 1.1 cgd p = "EXACTLY"; 1241 1.1 cgd break; 1242 1.1 cgd case NOTHING: 1243 1.1 cgd p = "NOTHING"; 1244 1.1 cgd break; 1245 1.1 cgd case BACK: 1246 1.1 cgd p = "BACK"; 1247 1.1 cgd break; 1248 1.1 cgd case END: 1249 1.1 cgd p = "END"; 1250 1.1 cgd break; 1251 1.1 cgd case OPEN+1: 1252 1.1 cgd case OPEN+2: 1253 1.1 cgd case OPEN+3: 1254 1.1 cgd case OPEN+4: 1255 1.1 cgd case OPEN+5: 1256 1.1 cgd case OPEN+6: 1257 1.1 cgd case OPEN+7: 1258 1.1 cgd case OPEN+8: 1259 1.1 cgd case OPEN+9: 1260 1.6 mrg (void)snprintf(buf+strlen(buf), sizeof(buf) - strlen(buf), 1261 1.6 mrg "OPEN%d", OP(op)-OPEN); 1262 1.1 cgd p = NULL; 1263 1.1 cgd break; 1264 1.1 cgd case CLOSE+1: 1265 1.1 cgd case CLOSE+2: 1266 1.1 cgd case CLOSE+3: 1267 1.1 cgd case CLOSE+4: 1268 1.1 cgd case CLOSE+5: 1269 1.1 cgd case CLOSE+6: 1270 1.1 cgd case CLOSE+7: 1271 1.1 cgd case CLOSE+8: 1272 1.1 cgd case CLOSE+9: 1273 1.6 mrg (void)snprintf(buf+strlen(buf), sizeof(buf) - strlen(buf), 1274 1.6 mrg "CLOSE%d", OP(op)-CLOSE); 1275 1.1 cgd p = NULL; 1276 1.1 cgd break; 1277 1.1 cgd case STAR: 1278 1.1 cgd p = "STAR"; 1279 1.1 cgd break; 1280 1.1 cgd case PLUS: 1281 1.1 cgd p = "PLUS"; 1282 1.1 cgd break; 1283 1.1 cgd case WORDA: 1284 1.1 cgd p = "WORDA"; 1285 1.1 cgd break; 1286 1.1 cgd case WORDZ: 1287 1.1 cgd p = "WORDZ"; 1288 1.1 cgd break; 1289 1.1 cgd default: 1290 1.17 christos p = NULL; 1291 1.1 cgd regerror("corrupted opcode"); 1292 1.1 cgd break; 1293 1.1 cgd } 1294 1.1 cgd if (p != NULL) 1295 1.6 mrg (void)strncat(buf, p, sizeof(buf) - strlen(buf) - 1); 1296 1.1 cgd return(buf); 1297 1.1 cgd } 1298 1.1 cgd #endif 1299 1.1 cgd 1300 1.1 cgd /* 1301 1.1 cgd * The following is provided for those people who do not have strcspn() in 1302 1.1 cgd * their C libraries. They should get off their butts and do something 1303 1.1 cgd * about it; at least one public-domain implementation of those (highly 1304 1.1 cgd * useful) string routines has been published on Usenet. 1305 1.1 cgd */ 1306 1.1 cgd #ifdef STRCSPN 1307 1.1 cgd /* 1308 1.1 cgd * strcspn - find length of initial segment of s1 consisting entirely 1309 1.1 cgd * of characters not from s2 1310 1.1 cgd */ 1311 1.1 cgd 1312 1.1 cgd static int 1313 1.1 cgd strcspn(s1, s2) 1314 1.1 cgd char *s1; 1315 1.1 cgd char *s2; 1316 1.1 cgd { 1317 1.8 perry char *scan1; 1318 1.8 perry char *scan2; 1319 1.8 perry int count; 1320 1.1 cgd 1321 1.1 cgd count = 0; 1322 1.1 cgd for (scan1 = s1; *scan1 != '\0'; scan1++) { 1323 1.1 cgd for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1324 1.1 cgd if (*scan1 == *scan2++) 1325 1.1 cgd return(count); 1326 1.1 cgd count++; 1327 1.1 cgd } 1328 1.1 cgd return(count); 1329 1.1 cgd } 1330 1.1 cgd #endif 1331