Home | History | Annotate | Line # | Download | only in dist
      1 /*	$NetBSD: regexp.c,v 1.4 2023/10/06 05:49:49 simonb Exp $	*/
      2 
      3 /*
      4  * regcomp and regexec -- regsub and regerror are elsewhere
      5  *
      6  *	Copyright (c) 1986 by University of Toronto.
      7  *	Written by Henry Spencer.  Not derived from licensed software.
      8  *
      9  *	Permission is granted to anyone to use this software for any
     10  *	purpose on any computer system, and to redistribute it freely,
     11  *	subject to the following restrictions:
     12  *
     13  *	1. The author is not responsible for the consequences of use of
     14  *		this software, no matter how awful, even if they arise
     15  *		from defects in it.
     16  *
     17  *	2. The origin of this software must not be misrepresented, either
     18  *		by explicit claim or by omission.
     19  *
     20  *	3. Altered versions must be plainly marked as such, and must not
     21  *		be misrepresented as being the original software.
     22  *
     23  * Beware that some of this code is subtly aware of the way operator
     24  * precedence is structured in regular expressions.  Serious changes in
     25  * regular-expression syntax might require a total rethink.
     26  *
     27  * *** NOTE: this code has been altered slightly for use in Tcl. ***
     28  * Slightly modified by David MacKenzie to undo most of the changes for TCL.
     29  * Added regexec2 with notbol parameter. -- 4/19/99 Mark Nudelman
     30  */
     31 
     32 #include "less.h"
     33 #if HAVE_STDIO_H
     34 #include <stdio.h>
     35 #endif
     36 #if HAVE_STDLIB_H
     37 #include <stdlib.h>
     38 #endif
     39 #if HAVE_STRING_H
     40 #include <string.h>
     41 #endif
     42 #include "regexp.h"
     43 
     44 /*
     45  * The "internal use only" fields in regexp.h are present to pass info from
     46  * compile to execute that permits the execute phase to run lots faster on
     47  * simple cases.  They are:
     48  *
     49  * regstart	char that must begin a match; '\0' if none obvious
     50  * reganch	is the match anchored (at beginning-of-line only)?
     51  * regmust	string (pointer into program) that match must include, or NULL
     52  * regmlen	length of regmust string
     53  *
     54  * Regstart and reganch permit very fast decisions on suitable starting points
     55  * for a match, cutting down the work a lot.  Regmust permits fast rejection
     56  * of lines that cannot possibly match.  The regmust tests are costly enough
     57  * that regcomp() supplies a regmust only if the r.e. contains something
     58  * potentially expensive (at present, the only such thing detected is * or +
     59  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
     60  * supplied because the test in regexec() needs it and regcomp() is
     61  * computing it anyway.
     62  */
     63 
     64 /*
     65  * Structure for regexp "program".  This is essentially a linear encoding
     66  * of a nondeterministic finite-state machine (aka syntax charts or
     67  * "railroad normal form" in parsing technology).  Each node is an opcode
     68  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
     69  * all nodes except BRANCH implement concatenation; a "next" pointer with
     70  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
     71  * have one of the subtle syntax dependencies:  an individual BRANCH (as
     72  * opposed to a collection of them) is never concatenated with anything
     73  * because of operator precedence.)  The operand of some types of node is
     74  * a literal string; for others, it is a node leading into a sub-FSM.  In
     75  * particular, the operand of a BRANCH node is the first node of the branch.
     76  * (NB this is *not* a tree structure:  the tail of the branch connects
     77  * to the thing following the set of BRANCHes.)  The opcodes are:
     78  */
     79 
     80 /* definition	number	opnd?	meaning */
     81 #undef EOL
     82 #define	END	0	/* no	End of program. */
     83 #define	BOL	1	/* no	Match "" at beginning of line. */
     84 #define	EOL	2	/* no	Match "" at end of line. */
     85 #define	ANY	3	/* no	Match any one character. */
     86 #define	ANYOF	4	/* str	Match any character in this string. */
     87 #define	ANYBUT	5	/* str	Match any character not in this string. */
     88 #define	BRANCH	6	/* node	Match this alternative, or the next... */
     89 #define	BACK	7	/* no	Match "", "next" ptr points backward. */
     90 #define	EXACTLY	8	/* str	Match this string. */
     91 #define	NOTHING	9	/* no	Match empty string. */
     92 #define	STAR	10	/* node	Match this (simple) thing 0 or more times. */
     93 #define	PLUS	11	/* node	Match this (simple) thing 1 or more times. */
     94 #define	OPEN	20	/* no	Mark this point in input as start of #n. */
     95 			/*	OPEN+1 is number 1, etc. */
     96 #define	CLOSE	30	/* no	Analogous to OPEN. */
     97 
     98 /*
     99  * Opcode notes:
    100  *
    101  * BRANCH	The set of branches constituting a single choice are hooked
    102  *		together with their "next" pointers, since precedence prevents
    103  *		anything being concatenated to any individual branch.  The
    104  *		"next" pointer of the last BRANCH in a choice points to the
    105  *		thing following the whole choice.  This is also where the
    106  *		final "next" pointer of each individual branch points; each
    107  *		branch starts with the operand node of a BRANCH node.
    108  *
    109  * BACK		Normal "next" pointers all implicitly point forward; BACK
    110  *		exists to make loop structures possible.
    111  *
    112  * STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
    113  *		BRANCH structures using BACK.  Simple cases (one character
    114  *		per match) are implemented with STAR and PLUS for speed
    115  *		and to minimize recursive plunges.
    116  *
    117  * OPEN,CLOSE	...are numbered at compile time.
    118  */
    119 
    120 /*
    121  * A node is one char of opcode followed by two chars of "next" pointer.
    122  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
    123  * value is a positive offset from the opcode of the node containing it.
    124  * An operand, if any, simply follows the node.  (Note that much of the
    125  * code generation knows about this implicit relationship.)
    126  *
    127  * Using two bytes for the "next" pointer is vast overkill for most things,
    128  * but allows patterns to get big without disasters.
    129  */
    130 #define	OP(p)	(*(p))
    131 #define	NEXT(p)	(((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
    132 #define	OPERAND(p)	((p) + 3)
    133 
    134 /*
    135  * See regmagic.h for one further detail of program structure.
    136  */
    137 
    138 
    139 /*
    140  * Utility definitions.
    141  */
    142 #ifndef CHARBITS
    143 #define	UCHARAT(p)	((int)*(unsigned char *)(p))
    144 #else
    145 #define	UCHARAT(p)	((int)*(p)&CHARBITS)
    146 #endif
    147 
    148 #define	FAIL(m)	{ regerror(m); return(NULL); }
    149 #define	ISMULT(c)	((c) == '*' || (c) == '+' || (c) == '?')
    150 #define	META	"^$.[()|?+*\\"
    151 
    152 /*
    153  * Flags to be passed up and down.
    154  */
    155 #define	HASWIDTH	01	/* Known never to match null string. */
    156 #define	SIMPLE		02	/* Simple enough to be STAR/PLUS operand. */
    157 #define	SPSTART		04	/* Starts with * or +. */
    158 #define	WORST		0	/* Worst case. */
    159 
    160 /*
    161  * Global work variables for regcomp().
    162  */
    163 static char *regparse;		/* Input-scan pointer. */
    164 static int regnpar;		/* () count. */
    165 static char regdummy;
    166 static char *regcode;		/* Code-emit pointer; &regdummy = don't. */
    167 static long regsize;		/* Code size. */
    168 
    169 /*
    170  * The first byte of the regexp internal "program" is actually this magic
    171  * number; the start node begins in the second byte.
    172  */
    173 #define	MAGIC	0234
    174 
    175 
    176 /*
    177  * Forward declarations for regcomp()'s friends.
    178  */
    179 #ifndef STATIC
    180 #define	STATIC	static
    181 #endif
    182 STATIC char *reg();
    183 STATIC char *regbranch();
    184 STATIC char *regpiece();
    185 STATIC char *regatom();
    186 STATIC char *regnode();
    187 STATIC char *regnext();
    188 STATIC void regc();
    189 STATIC void reginsert();
    190 STATIC void regtail();
    191 STATIC void regoptail();
    192 #ifdef STRCSPN
    193 STATIC int strcspn();
    194 #endif
    195 
    196 /*
    197  - regcomp - compile a regular expression into internal code
    198  *
    199  * We can't allocate space until we know how big the compiled form will be,
    200  * but we can't compile it (and thus know how big it is) until we've got a
    201  * place to put the code.  So we cheat:  we compile it twice, once with code
    202  * generation turned off and size counting turned on, and once "for real".
    203  * This also means that we don't allocate space until we are sure that the
    204  * thing really will compile successfully, and we never have to move the
    205  * code and thus invalidate pointers into it.  (Note that it has to be in
    206  * one piece because free() must be able to free it all.)
    207  *
    208  * Beware that the optimization-preparation code in here knows about some
    209  * of the structure of the compiled regexp.
    210  */
    211 regexp *
    212 regcomp(exp)
    213 char *exp;
    214 {
    215 	register regexp *r;
    216 	register char *scan;
    217 	register char *longest;
    218 	register int len;
    219 	int flags;
    220 
    221 	if (exp == NULL)
    222 		FAIL("NULL argument");
    223 
    224 	/* First pass: determine size, legality. */
    225 	regparse = exp;
    226 	regnpar = 1;
    227 	regsize = 0L;
    228 	regcode = &regdummy;
    229 	regc(MAGIC);
    230 	if (reg(0, &flags) == NULL)
    231 		return(NULL);
    232 
    233 	/* Small enough for pointer-storage convention? */
    234 	if (regsize >= 32767L)		/* Probably could be 65535L. */
    235 		FAIL("regexp too big");
    236 
    237 	/* Allocate space. */
    238 	r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
    239 	if (r == NULL)
    240 		FAIL("out of space");
    241 
    242 	/* Second pass: emit code. */
    243 	regparse = exp;
    244 	regnpar = 1;
    245 	regcode = r->program;
    246 	regc(MAGIC);
    247 	if (reg(0, &flags) == NULL)
    248 	{
    249 		free(r);
    250 		return(NULL);
    251 	}
    252 
    253 	/* Dig out information for optimizations. */
    254 	r->regstart = '\0';	/* Worst-case defaults. */
    255 	r->reganch = 0;
    256 	r->regmust = NULL;
    257 	r->regmlen = 0;
    258 	scan = r->program+1;			/* First BRANCH. */
    259 	if (OP(regnext(scan)) == END) {		/* Only one top-level choice. */
    260 		scan = OPERAND(scan);
    261 
    262 		/* Starting-point info. */
    263 		if (OP(scan) == EXACTLY)
    264 			r->regstart = *OPERAND(scan);
    265 		else if (OP(scan) == BOL)
    266 			r->reganch++;
    267 
    268 		/*
    269 		 * If there's something expensive in the r.e., find the
    270 		 * longest literal string that must appear and make it the
    271 		 * regmust.  Resolve ties in favor of later strings, since
    272 		 * the regstart check works with the beginning of the r.e.
    273 		 * and avoiding duplication strengthens checking.  Not a
    274 		 * strong reason, but sufficient in the absence of others.
    275 		 */
    276 		if (flags&SPSTART) {
    277 			longest = NULL;
    278 			len = 0;
    279 			for (; scan != NULL; scan = regnext(scan))
    280 				if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
    281 					longest = OPERAND(scan);
    282 					len = (int) strlen(OPERAND(scan));
    283 				}
    284 			r->regmust = longest;
    285 			r->regmlen = len;
    286 		}
    287 	}
    288 
    289 	return(r);
    290 }
    291 
    292 /*
    293  - reg - regular expression, i.e. main body or parenthesized thing
    294  *
    295  * Caller must absorb opening parenthesis.
    296  *
    297  * Combining parenthesis handling with the base level of regular expression
    298  * is a trifle forced, but the need to tie the tails of the branches to what
    299  * follows makes it hard to avoid.
    300  */
    301 static char *
    302 reg(paren, flagp)
    303 int paren;			/* Parenthesized? */
    304 int *flagp;
    305 {
    306 	register char *ret;
    307 	register char *br;
    308 	register char *ender;
    309 	register int parno = 0;
    310 	int flags;
    311 
    312 	*flagp = HASWIDTH;	/* Tentatively. */
    313 
    314 	/* Make an OPEN node, if parenthesized. */
    315 	if (paren) {
    316 		if (regnpar >= NSUBEXP)
    317 			FAIL("too many ()");
    318 		parno = regnpar;
    319 		regnpar++;
    320 		ret = regnode(OPEN+parno);
    321 	} else
    322 		ret = NULL;
    323 
    324 	/* Pick up the branches, linking them together. */
    325 	br = regbranch(&flags);
    326 	if (br == NULL)
    327 		return(NULL);
    328 	if (ret != NULL)
    329 		regtail(ret, br);	/* OPEN -> first. */
    330 	else
    331 		ret = br;
    332 	if (!(flags&HASWIDTH))
    333 		*flagp &= ~HASWIDTH;
    334 	*flagp |= flags&SPSTART;
    335 	while (*regparse == '|') {
    336 		regparse++;
    337 		br = regbranch(&flags);
    338 		if (br == NULL)
    339 			return(NULL);
    340 		regtail(ret, br);	/* BRANCH -> BRANCH. */
    341 		if (!(flags&HASWIDTH))
    342 			*flagp &= ~HASWIDTH;
    343 		*flagp |= flags&SPSTART;
    344 	}
    345 
    346 	/* Make a closing node, and hook it on the end. */
    347 	ender = regnode((paren) ? CLOSE+parno : END);
    348 	regtail(ret, ender);
    349 
    350 	/* Hook the tails of the branches to the closing node. */
    351 	for (br = ret; br != NULL; br = regnext(br))
    352 		regoptail(br, ender);
    353 
    354 	/* Check for proper termination. */
    355 	if (paren && *regparse++ != ')') {
    356 		FAIL("unmatched ()");
    357 	} else if (!paren && *regparse != '\0') {
    358 		if (*regparse == ')') {
    359 			FAIL("unmatched ()");
    360 		} else
    361 			FAIL("junk on end");	/* "Can't happen". */
    362 		/* NOTREACHED */
    363 	}
    364 
    365 	return(ret);
    366 }
    367 
    368 /*
    369  - regbranch - one alternative of an | operator
    370  *
    371  * Implements the concatenation operator.
    372  */
    373 static char *
    374 regbranch(flagp)
    375 int *flagp;
    376 {
    377 	register char *ret;
    378 	register char *chain;
    379 	register char *latest;
    380 	int flags;
    381 
    382 	*flagp = WORST;		/* Tentatively. */
    383 
    384 	ret = regnode(BRANCH);
    385 	chain = NULL;
    386 	while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
    387 		latest = regpiece(&flags);
    388 		if (latest == NULL)
    389 			return(NULL);
    390 		*flagp |= flags&HASWIDTH;
    391 		if (chain == NULL)	/* First piece. */
    392 			*flagp |= flags&SPSTART;
    393 		else
    394 			regtail(chain, latest);
    395 		chain = latest;
    396 	}
    397 	if (chain == NULL)	/* Loop ran zero times. */
    398 		(void) regnode(NOTHING);
    399 
    400 	return(ret);
    401 }
    402 
    403 /*
    404  - regpiece - something followed by possible [*+?]
    405  *
    406  * Note that the branching code sequences used for ? and the general cases
    407  * of * and + are somewhat optimized:  they use the same NOTHING node as
    408  * both the endmarker for their branch list and the body of the last branch.
    409  * It might seem that this node could be dispensed with entirely, but the
    410  * endmarker role is not redundant.
    411  */
    412 static char *
    413 regpiece(flagp)
    414 int *flagp;
    415 {
    416 	register char *ret;
    417 	register char op;
    418 	register char *next;
    419 	int flags;
    420 
    421 	ret = regatom(&flags);
    422 	if (ret == NULL)
    423 		return(NULL);
    424 
    425 	op = *regparse;
    426 	if (!ISMULT(op)) {
    427 		*flagp = flags;
    428 		return(ret);
    429 	}
    430 
    431 	if (!(flags&HASWIDTH) && op != '?')
    432 		FAIL("*+ operand could be empty");
    433 	*flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
    434 
    435 	if (op == '*' && (flags&SIMPLE))
    436 		reginsert(STAR, ret);
    437 	else if (op == '*') {
    438 		/* Emit x* as (x&|), where & means "self". */
    439 		reginsert(BRANCH, ret);			/* Either x */
    440 		regoptail(ret, regnode(BACK));		/* and loop */
    441 		regoptail(ret, ret);			/* back */
    442 		regtail(ret, regnode(BRANCH));		/* or */
    443 		regtail(ret, regnode(NOTHING));		/* null. */
    444 	} else if (op == '+' && (flags&SIMPLE))
    445 		reginsert(PLUS, ret);
    446 	else if (op == '+') {
    447 		/* Emit x+ as x(&|), where & means "self". */
    448 		next = regnode(BRANCH);			/* Either */
    449 		regtail(ret, next);
    450 		regtail(regnode(BACK), ret);		/* loop back */
    451 		regtail(next, regnode(BRANCH));		/* or */
    452 		regtail(ret, regnode(NOTHING));		/* null. */
    453 	} else if (op == '?') {
    454 		/* Emit x? as (x|) */
    455 		reginsert(BRANCH, ret);			/* Either x */
    456 		regtail(ret, regnode(BRANCH));		/* or */
    457 		next = regnode(NOTHING);		/* null. */
    458 		regtail(ret, next);
    459 		regoptail(ret, next);
    460 	}
    461 	regparse++;
    462 	if (ISMULT(*regparse))
    463 		FAIL("nested *?+");
    464 
    465 	return(ret);
    466 }
    467 
    468 /*
    469  - regatom - the lowest level
    470  *
    471  * Optimization:  gobbles an entire sequence of ordinary characters so that
    472  * it can turn them into a single node, which is smaller to store and
    473  * faster to run.  Backslashed characters are exceptions, each becoming a
    474  * separate node; the code is simpler that way and it's not worth fixing.
    475  */
    476 static char *
    477 regatom(flagp)
    478 int *flagp;
    479 {
    480 	register char *ret;
    481 	int flags;
    482 
    483 	*flagp = WORST;		/* Tentatively. */
    484 
    485 	switch (*regparse++) {
    486 	case '^':
    487 		ret = regnode(BOL);
    488 		break;
    489 	case '$':
    490 		ret = regnode(EOL);
    491 		break;
    492 	case '.':
    493 		ret = regnode(ANY);
    494 		*flagp |= HASWIDTH|SIMPLE;
    495 		break;
    496 	case '[': {
    497 			register int clss;
    498 			register int classend;
    499 
    500 			if (*regparse == '^') {	/* Complement of range. */
    501 				ret = regnode(ANYBUT);
    502 				regparse++;
    503 			} else
    504 				ret = regnode(ANYOF);
    505 			if (*regparse == ']' || *regparse == '-')
    506 				regc(*regparse++);
    507 			while (*regparse != '\0' && *regparse != ']') {
    508 				if (*regparse == '-') {
    509 					regparse++;
    510 					if (*regparse == ']' || *regparse == '\0')
    511 						regc('-');
    512 					else {
    513 						clss = UCHARAT(regparse-2)+1;
    514 						classend = UCHARAT(regparse);
    515 						if (clss > classend+1)
    516 							FAIL("invalid [] range");
    517 						for (; clss <= classend; clss++)
    518 							regc(clss);
    519 						regparse++;
    520 					}
    521 				} else
    522 					regc(*regparse++);
    523 			}
    524 			regc('\0');
    525 			if (*regparse != ']')
    526 				FAIL("unmatched []");
    527 			regparse++;
    528 			*flagp |= HASWIDTH|SIMPLE;
    529 		}
    530 		break;
    531 	case '(':
    532 		ret = reg(1, &flags);
    533 		if (ret == NULL)
    534 			return(NULL);
    535 		*flagp |= flags&(HASWIDTH|SPSTART);
    536 		break;
    537 	case '\0':
    538 	case '|':
    539 	case ')':
    540 		FAIL("internal urp");	/* Supposed to be caught earlier. */
    541 		/* NOTREACHED */
    542 		break;
    543 	case '?':
    544 	case '+':
    545 	case '*':
    546 		FAIL("?+* follows nothing");
    547 		/* NOTREACHED */
    548 		break;
    549 	case '\\':
    550 		if (*regparse == '\0')
    551 			FAIL("trailing \\");
    552 		ret = regnode(EXACTLY);
    553 		regc(*regparse++);
    554 		regc('\0');
    555 		*flagp |= HASWIDTH|SIMPLE;
    556 		break;
    557 	default: {
    558 			register int len;
    559 			register char ender;
    560 
    561 			regparse--;
    562 			len = (int) strcspn(regparse, META);
    563 			if (len <= 0)
    564 				FAIL("internal disaster");
    565 			ender = *(regparse+len);
    566 			if (len > 1 && ISMULT(ender))
    567 				len--;		/* Back off clear of ?+* operand. */
    568 			*flagp |= HASWIDTH;
    569 			if (len == 1)
    570 				*flagp |= SIMPLE;
    571 			ret = regnode(EXACTLY);
    572 			while (len > 0) {
    573 				regc(*regparse++);
    574 				len--;
    575 			}
    576 			regc('\0');
    577 		}
    578 		break;
    579 	}
    580 
    581 	return(ret);
    582 }
    583 
    584 /*
    585  - regnode - emit a node
    586  */
    587 static char *			/* Location. */
    588 regnode(op)
    589 char op;
    590 {
    591 	register char *ret;
    592 	register char *ptr;
    593 
    594 	ret = regcode;
    595 	if (ret == &regdummy) {
    596 		regsize += 3;
    597 		return(ret);
    598 	}
    599 
    600 	ptr = ret;
    601 	*ptr++ = op;
    602 	*ptr++ = '\0';		/* Null "next" pointer. */
    603 	*ptr++ = '\0';
    604 	regcode = ptr;
    605 
    606 	return(ret);
    607 }
    608 
    609 /*
    610  - regc - emit (if appropriate) a byte of code
    611  */
    612 static void
    613 regc(b)
    614 char b;
    615 {
    616 	if (regcode != &regdummy)
    617 		*regcode++ = b;
    618 	else
    619 		regsize++;
    620 }
    621 
    622 /*
    623  - reginsert - insert an operator in front of already-emitted operand
    624  *
    625  * Means relocating the operand.
    626  */
    627 static void
    628 reginsert(op, opnd)
    629 char op;
    630 char *opnd;
    631 {
    632 	register char *src;
    633 	register char *dst;
    634 	register char *place;
    635 
    636 	if (regcode == &regdummy) {
    637 		regsize += 3;
    638 		return;
    639 	}
    640 
    641 	src = regcode;
    642 	regcode += 3;
    643 	dst = regcode;
    644 	while (src > opnd)
    645 		*--dst = *--src;
    646 
    647 	place = opnd;		/* Op node, where operand used to be. */
    648 	*place++ = op;
    649 	*place++ = '\0';
    650 	*place++ = '\0';
    651 }
    652 
    653 /*
    654  - regtail - set the next-pointer at the end of a node chain
    655  */
    656 static void
    657 regtail(p, val)
    658 char *p;
    659 char *val;
    660 {
    661 	register char *scan;
    662 	register char *temp;
    663 	register int offset;
    664 
    665 	if (p == &regdummy)
    666 		return;
    667 
    668 	/* Find last node. */
    669 	scan = p;
    670 	for (;;) {
    671 		temp = regnext(scan);
    672 		if (temp == NULL)
    673 			break;
    674 		scan = temp;
    675 	}
    676 
    677 	if (OP(scan) == BACK)
    678 		offset = (int) (scan - val);
    679 	else
    680 		offset = (int) (val - scan);
    681 	*(scan+1) = (offset>>8)&0377;
    682 	*(scan+2) = offset&0377;
    683 }
    684 
    685 /*
    686  - regoptail - regtail on operand of first argument; nop if operandless
    687  */
    688 static void
    689 regoptail(p, val)
    690 char *p;
    691 char *val;
    692 {
    693 	/* "Operandless" and "op != BRANCH" are synonymous in practice. */
    694 	if (p == NULL || p == &regdummy || OP(p) != BRANCH)
    695 		return;
    696 	regtail(OPERAND(p), val);
    697 }
    698 
    699 /*
    700  * regexec and friends
    701  */
    702 
    703 /*
    704  * Global work variables for regexec().
    705  */
    706 static char *reginput;		/* String-input pointer. */
    707 static char *regbol;		/* Beginning of input, for ^ check. */
    708 static char **regstartp;	/* Pointer to startp array. */
    709 static char **regendp;		/* Ditto for endp. */
    710 
    711 /*
    712  * Forwards.
    713  */
    714 STATIC int regtry();
    715 STATIC int regmatch();
    716 STATIC int regrepeat();
    717 
    718 #ifdef DEBUG
    719 int regnarrate = 0;
    720 void regdump();
    721 STATIC char *regprop();
    722 #endif
    723 
    724 /*
    725  - regexec - match a regexp against a string
    726  */
    727 int
    728 regexec2(prog, string, notbol)
    729 register regexp *prog;
    730 register char *string;
    731 int notbol;
    732 {
    733 	register char *s;
    734 
    735 	/* Be paranoid... */
    736 	if (prog == NULL || string == NULL) {
    737 		regerror("NULL parameter");
    738 		return(0);
    739 	}
    740 
    741 	/* Check validity of program. */
    742 	if (UCHARAT(prog->program) != MAGIC) {
    743 		regerror("corrupted program");
    744 		return(0);
    745 	}
    746 
    747 	/* If there is a "must appear" string, look for it. */
    748 	if (prog->regmust != NULL) {
    749 		s = string;
    750 		while ((s = strchr(s, prog->regmust[0])) != NULL) {
    751 			if (strncmp(s, prog->regmust, prog->regmlen) == 0)
    752 				break;	/* Found it. */
    753 			s++;
    754 		}
    755 		if (s == NULL)	/* Not present. */
    756 			return(0);
    757 	}
    758 
    759 	/* Mark beginning of line for ^ . */
    760 	if (notbol)
    761 		regbol = NULL;
    762 	else
    763 		regbol = string;
    764 
    765 	/* Simplest case:  anchored match need be tried only once. */
    766 	if (prog->reganch)
    767 		return(regtry(prog, string));
    768 
    769 	/* Messy cases:  unanchored match. */
    770 	s = string;
    771 	if (prog->regstart != '\0')
    772 		/* We know what char it must start with. */
    773 		while ((s = strchr(s, prog->regstart)) != NULL) {
    774 			if (regtry(prog, s))
    775 				return(1);
    776 			s++;
    777 		}
    778 	else
    779 		/* We don't -- general case. */
    780 		do {
    781 			if (regtry(prog, s))
    782 				return(1);
    783 		} while (*s++ != '\0');
    784 
    785 	/* Failure. */
    786 	return(0);
    787 }
    788 
    789 int
    790 regexec(prog, string)
    791 register regexp *prog;
    792 register char *string;
    793 {
    794 	return regexec2(prog, string, 0);
    795 }
    796 
    797 /*
    798  - regtry - try match at specific point
    799  */
    800 static int			/* 0 failure, 1 success */
    801 regtry(prog, string)
    802 regexp *prog;
    803 char *string;
    804 {
    805 	register int i;
    806 	register char **sp;
    807 	register char **ep;
    808 
    809 	reginput = string;
    810 	regstartp = prog->startp;
    811 	regendp = prog->endp;
    812 
    813 	sp = prog->startp;
    814 	ep = prog->endp;
    815 	for (i = NSUBEXP; i > 0; i--) {
    816 		*sp++ = NULL;
    817 		*ep++ = NULL;
    818 	}
    819 	if (regmatch(prog->program + 1)) {
    820 		prog->startp[0] = string;
    821 		prog->endp[0] = reginput;
    822 		return(1);
    823 	} else
    824 		return(0);
    825 }
    826 
    827 /*
    828  - regmatch - main matching routine
    829  *
    830  * Conceptually the strategy is simple:  check to see whether the current
    831  * node matches, call self recursively to see whether the rest matches,
    832  * and then act accordingly.  In practice we make some effort to avoid
    833  * recursion, in particular by going through "ordinary" nodes (that don't
    834  * need to know whether the rest of the match failed) by a loop instead of
    835  * by recursion.
    836  */
    837 static int			/* 0 failure, 1 success */
    838 regmatch(prog)
    839 char *prog;
    840 {
    841 	register char *scan;	/* Current node. */
    842 	char *next;		/* Next node. */
    843 
    844 	scan = prog;
    845 #ifdef DEBUG
    846 	if (scan != NULL && regnarrate)
    847 		fprintf(stderr, "%s(\n", regprop(scan));
    848 #endif
    849 	while (scan != NULL) {
    850 #ifdef DEBUG
    851 		if (regnarrate)
    852 			fprintf(stderr, "%s...\n", regprop(scan));
    853 #endif
    854 		next = regnext(scan);
    855 
    856 		switch (OP(scan)) {
    857 		case BOL:
    858 			if (reginput != regbol)
    859 				return(0);
    860 			break;
    861 		case EOL:
    862 			if (*reginput != '\0')
    863 				return(0);
    864 			break;
    865 		case ANY:
    866 			if (*reginput == '\0')
    867 				return(0);
    868 			reginput++;
    869 			break;
    870 		case EXACTLY: {
    871 				register int len;
    872 				register char *opnd;
    873 
    874 				opnd = OPERAND(scan);
    875 				/* Inline the first character, for speed. */
    876 				if (*opnd != *reginput)
    877 					return(0);
    878 				len = (int) strlen(opnd);
    879 				if (len > 1 && strncmp(opnd, reginput, len) != 0)
    880 					return(0);
    881 				reginput += len;
    882 			}
    883 			break;
    884 		case ANYOF:
    885  			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
    886 				return(0);
    887 			reginput++;
    888 			break;
    889 		case ANYBUT:
    890  			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
    891 				return(0);
    892 			reginput++;
    893 			break;
    894 		case NOTHING:
    895 			break;
    896 		case BACK:
    897 			break;
    898 		case OPEN+1:
    899 		case OPEN+2:
    900 		case OPEN+3:
    901 		case OPEN+4:
    902 		case OPEN+5:
    903 		case OPEN+6:
    904 		case OPEN+7:
    905 		case OPEN+8:
    906 		case OPEN+9: {
    907 				register int no;
    908 				register char *save;
    909 
    910 				no = OP(scan) - OPEN;
    911 				save = reginput;
    912 
    913 				if (regmatch(next)) {
    914 					/*
    915 					 * Don't set startp if some later
    916 					 * invocation of the same parentheses
    917 					 * already has.
    918 					 */
    919 					if (regstartp[no] == NULL)
    920 						regstartp[no] = save;
    921 					return(1);
    922 				} else
    923 					return(0);
    924 			}
    925 			/* NOTREACHED */
    926 			break;
    927 		case CLOSE+1:
    928 		case CLOSE+2:
    929 		case CLOSE+3:
    930 		case CLOSE+4:
    931 		case CLOSE+5:
    932 		case CLOSE+6:
    933 		case CLOSE+7:
    934 		case CLOSE+8:
    935 		case CLOSE+9: {
    936 				register int no;
    937 				register char *save;
    938 
    939 				no = OP(scan) - CLOSE;
    940 				save = reginput;
    941 
    942 				if (regmatch(next)) {
    943 					/*
    944 					 * Don't set endp if some later
    945 					 * invocation of the same parentheses
    946 					 * already has.
    947 					 */
    948 					if (regendp[no] == NULL)
    949 						regendp[no] = save;
    950 					return(1);
    951 				} else
    952 					return(0);
    953 			}
    954 			/* NOTREACHED */
    955 			break;
    956 		case BRANCH: {
    957 				register char *save;
    958 
    959 				if (OP(next) != BRANCH)		/* No choice. */
    960 					next = OPERAND(scan);	/* Avoid recursion. */
    961 				else {
    962 					do {
    963 						save = reginput;
    964 						if (regmatch(OPERAND(scan)))
    965 							return(1);
    966 						reginput = save;
    967 						scan = regnext(scan);
    968 					} while (scan != NULL && OP(scan) == BRANCH);
    969 					return(0);
    970 					/* NOTREACHED */
    971 				}
    972 			}
    973 			/* NOTREACHED */
    974 			break;
    975 		case STAR:
    976 		case PLUS: {
    977 				register char nextch;
    978 				register int no;
    979 				register char *save;
    980 				register int min;
    981 
    982 				/*
    983 				 * Lookahead to avoid useless match attempts
    984 				 * when we know what character comes next.
    985 				 */
    986 				nextch = '\0';
    987 				if (OP(next) == EXACTLY)
    988 					nextch = *OPERAND(next);
    989 				min = (OP(scan) == STAR) ? 0 : 1;
    990 				save = reginput;
    991 				no = regrepeat(OPERAND(scan));
    992 				while (no >= min) {
    993 					/* If it could work, try it. */
    994 					if (nextch == '\0' || *reginput == nextch)
    995 						if (regmatch(next))
    996 							return(1);
    997 					/* Couldn't or didn't -- back up. */
    998 					no--;
    999 					reginput = save + no;
   1000 				}
   1001 				return(0);
   1002 			}
   1003 			/* NOTREACHED */
   1004 			break;
   1005 		case END:
   1006 			return(1);	/* Success! */
   1007 			/* NOTREACHED */
   1008 			break;
   1009 		default:
   1010 			regerror("memory corruption");
   1011 			return(0);
   1012 			/* NOTREACHED */
   1013 			break;
   1014 		}
   1015 
   1016 		scan = next;
   1017 	}
   1018 
   1019 	/*
   1020 	 * We get here only if there's trouble -- normally "case END" is
   1021 	 * the terminating point.
   1022 	 */
   1023 	regerror("corrupted pointers");
   1024 	return(0);
   1025 }
   1026 
   1027 /*
   1028  - regrepeat - repeatedly match something simple, report how many
   1029  */
   1030 static int
   1031 regrepeat(p)
   1032 char *p;
   1033 {
   1034 	register int count = 0;
   1035 	register char *scan;
   1036 	register char *opnd;
   1037 
   1038 	scan = reginput;
   1039 	opnd = OPERAND(p);
   1040 	switch (OP(p)) {
   1041 	case ANY:
   1042 		count = (int) strlen(scan);
   1043 		scan += count;
   1044 		break;
   1045 	case EXACTLY:
   1046 		while (*opnd == *scan) {
   1047 			count++;
   1048 			scan++;
   1049 		}
   1050 		break;
   1051 	case ANYOF:
   1052 		while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
   1053 			count++;
   1054 			scan++;
   1055 		}
   1056 		break;
   1057 	case ANYBUT:
   1058 		while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
   1059 			count++;
   1060 			scan++;
   1061 		}
   1062 		break;
   1063 	default:		/* Oh dear.  Called inappropriately. */
   1064 		regerror("internal foulup");
   1065 		count = 0;	/* Best compromise. */
   1066 		break;
   1067 	}
   1068 	reginput = scan;
   1069 
   1070 	return(count);
   1071 }
   1072 
   1073 /*
   1074  - regnext - dig the "next" pointer out of a node
   1075  */
   1076 static char *
   1077 regnext(p)
   1078 register char *p;
   1079 {
   1080 	register int offset;
   1081 
   1082 	if (p == &regdummy)
   1083 		return(NULL);
   1084 
   1085 	offset = NEXT(p);
   1086 	if (offset == 0)
   1087 		return(NULL);
   1088 
   1089 	if (OP(p) == BACK)
   1090 		return(p-offset);
   1091 	else
   1092 		return(p+offset);
   1093 }
   1094 
   1095 #ifdef DEBUG
   1096 
   1097 STATIC char *regprop();
   1098 
   1099 /*
   1100  - regdump - dump a regexp onto stdout in vaguely comprehensible form
   1101  */
   1102 void
   1103 regdump(r)
   1104 regexp *r;
   1105 {
   1106 	register char *s;
   1107 	register char op = EXACTLY;	/* Arbitrary non-END op. */
   1108 	register char *next;
   1109 
   1110 
   1111 	s = r->program + 1;
   1112 	while (op != END) {	/* While that wasn't END last time... */
   1113 		op = OP(s);
   1114 		printf("%2d%s", s-r->program, regprop(s));	/* Where, what. */
   1115 		next = regnext(s);
   1116 		if (next == NULL)		/* Next ptr. */
   1117 			printf("(0)");
   1118 		else
   1119 			printf("(%d)", (s-r->program)+(next-s));
   1120 		s += 3;
   1121 		if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
   1122 			/* Literal string, where present. */
   1123 			while (*s != '\0') {
   1124 				putchar(*s);
   1125 				s++;
   1126 			}
   1127 			s++;
   1128 		}
   1129 		putchar('\n');
   1130 	}
   1131 
   1132 	/* Header fields of interest. */
   1133 	if (r->regstart != '\0')
   1134 		printf("start `%c' ", r->regstart);
   1135 	if (r->reganch)
   1136 		printf("anchored ");
   1137 	if (r->regmust != NULL)
   1138 		printf("must have \"%s\"", r->regmust);
   1139 	printf("\n");
   1140 }
   1141 
   1142 /*
   1143  - regprop - printable representation of opcode
   1144  */
   1145 static char *
   1146 regprop(op)
   1147 char *op;
   1148 {
   1149 	register char *p;
   1150 	static char buf[50];
   1151 
   1152 	(void) strcpy(buf, ":");
   1153 
   1154 	switch (OP(op)) {
   1155 	case BOL:
   1156 		p = "BOL";
   1157 		break;
   1158 	case EOL:
   1159 		p = "EOL";
   1160 		break;
   1161 	case ANY:
   1162 		p = "ANY";
   1163 		break;
   1164 	case ANYOF:
   1165 		p = "ANYOF";
   1166 		break;
   1167 	case ANYBUT:
   1168 		p = "ANYBUT";
   1169 		break;
   1170 	case BRANCH:
   1171 		p = "BRANCH";
   1172 		break;
   1173 	case EXACTLY:
   1174 		p = "EXACTLY";
   1175 		break;
   1176 	case NOTHING:
   1177 		p = "NOTHING";
   1178 		break;
   1179 	case BACK:
   1180 		p = "BACK";
   1181 		break;
   1182 	case END:
   1183 		p = "END";
   1184 		break;
   1185 	case OPEN+1:
   1186 	case OPEN+2:
   1187 	case OPEN+3:
   1188 	case OPEN+4:
   1189 	case OPEN+5:
   1190 	case OPEN+6:
   1191 	case OPEN+7:
   1192 	case OPEN+8:
   1193 	case OPEN+9:
   1194 		sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
   1195 		p = NULL;
   1196 		break;
   1197 	case CLOSE+1:
   1198 	case CLOSE+2:
   1199 	case CLOSE+3:
   1200 	case CLOSE+4:
   1201 	case CLOSE+5:
   1202 	case CLOSE+6:
   1203 	case CLOSE+7:
   1204 	case CLOSE+8:
   1205 	case CLOSE+9:
   1206 		sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
   1207 		p = NULL;
   1208 		break;
   1209 	case STAR:
   1210 		p = "STAR";
   1211 		break;
   1212 	case PLUS:
   1213 		p = "PLUS";
   1214 		break;
   1215 	default:
   1216 		regerror("corrupted opcode");
   1217 		break;
   1218 	}
   1219 	if (p != NULL)
   1220 		(void) strcat(buf, p);
   1221 	return(buf);
   1222 }
   1223 #endif
   1224 
   1225 /*
   1226  * The following is provided for those people who do not have strcspn() in
   1227  * their C libraries.  They should get off their butts and do something
   1228  * about it; at least one public-domain implementation of those (highly
   1229  * useful) string routines has been published on Usenet.
   1230  */
   1231 #ifdef STRCSPN
   1232 /*
   1233  * strcspn - find length of initial segment of s1 consisting entirely
   1234  * of characters not from s2
   1235  */
   1236 
   1237 static int
   1238 strcspn(s1, s2)
   1239 char *s1;
   1240 char *s2;
   1241 {
   1242 	register char *scan1;
   1243 	register char *scan2;
   1244 	register int count;
   1245 
   1246 	count = 0;
   1247 	for (scan1 = s1; *scan1 != '\0'; scan1++) {
   1248 		for (scan2 = s2; *scan2 != '\0';)	/* ++ moved down. */
   1249 			if (*scan1 == *scan2++)
   1250 				return(count);
   1251 		count++;
   1252 	}
   1253 	return(count);
   1254 }
   1255 #endif
   1256