regexp.c revision 1.19 1 1.1 cgd /*
2 1.1 cgd * regcomp and regexec -- regsub and regerror are elsewhere
3 1.1 cgd *
4 1.1 cgd * Copyright (c) 1986 by University of Toronto.
5 1.1 cgd * Written by Henry Spencer. Not derived from licensed software.
6 1.1 cgd *
7 1.1 cgd * Permission is granted to anyone to use this software for any
8 1.1 cgd * purpose on any computer system, and to redistribute it freely,
9 1.1 cgd * subject to the following restrictions:
10 1.1 cgd *
11 1.1 cgd * 1. The author is not responsible for the consequences of use of
12 1.1 cgd * this software, no matter how awful, even if they arise
13 1.1 cgd * from defects in it.
14 1.1 cgd *
15 1.1 cgd * 2. The origin of this software must not be misrepresented, either
16 1.1 cgd * by explicit claim or by omission.
17 1.1 cgd *
18 1.1 cgd * 3. Altered versions must be plainly marked as such, and must not
19 1.1 cgd * be misrepresented as being the original software.
20 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 1.1 cgd *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 1.1 cgd *** to assist in implementing egrep.
23 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 1.1 cgd *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 1.1 cgd *** as in BSD grep and ex.
26 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 1.1 cgd *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 1.1 cgd *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
30 1.1 cgd *
31 1.1 cgd * Beware that some of this code is subtly aware of the way operator
32 1.1 cgd * precedence is structured in regular expressions. Serious changes in
33 1.1 cgd * regular-expression syntax might require a total rethink.
34 1.1 cgd */
35 1.2 mycroft
36 1.7 lukem #include <sys/cdefs.h>
37 1.14 msaitoh #if defined(LIBC_SCCS) && !defined(lint)
38 1.19 christos __RCSID("$NetBSD: regexp.c,v 1.19 2016/01/26 16:05:18 christos Exp $");
39 1.14 msaitoh #endif /* LIBC_SCCS and not lint */
40 1.2 mycroft
41 1.12 lukem #include <ctype.h>
42 1.1 cgd #include <regexp.h>
43 1.1 cgd #include <stdio.h>
44 1.1 cgd #include <stdlib.h>
45 1.1 cgd #include <string.h>
46 1.1 cgd #include "regmagic.h"
47 1.1 cgd
48 1.1 cgd /*
49 1.1 cgd * The "internal use only" fields in regexp.h are present to pass info from
50 1.1 cgd * compile to execute that permits the execute phase to run lots faster on
51 1.1 cgd * simple cases. They are:
52 1.1 cgd *
53 1.1 cgd * regstart char that must begin a match; '\0' if none obvious
54 1.1 cgd * reganch is the match anchored (at beginning-of-line only)?
55 1.1 cgd * regmust string (pointer into program) that match must include, or NULL
56 1.1 cgd * regmlen length of regmust string
57 1.1 cgd *
58 1.1 cgd * Regstart and reganch permit very fast decisions on suitable starting points
59 1.1 cgd * for a match, cutting down the work a lot. Regmust permits fast rejection
60 1.1 cgd * of lines that cannot possibly match. The regmust tests are costly enough
61 1.1 cgd * that regcomp() supplies a regmust only if the r.e. contains something
62 1.1 cgd * potentially expensive (at present, the only such thing detected is * or +
63 1.1 cgd * at the start of the r.e., which can involve a lot of backup). Regmlen is
64 1.1 cgd * supplied because the test in regexec() needs it and regcomp() is computing
65 1.1 cgd * it anyway.
66 1.1 cgd */
67 1.1 cgd
68 1.1 cgd /*
69 1.1 cgd * Structure for regexp "program". This is essentially a linear encoding
70 1.1 cgd * of a nondeterministic finite-state machine (aka syntax charts or
71 1.1 cgd * "railroad normal form" in parsing technology). Each node is an opcode
72 1.1 cgd * plus a "next" pointer, possibly plus an operand. "Next" pointers of
73 1.1 cgd * all nodes except BRANCH implement concatenation; a "next" pointer with
74 1.1 cgd * a BRANCH on both ends of it is connecting two alternatives. (Here we
75 1.1 cgd * have one of the subtle syntax dependencies: an individual BRANCH (as
76 1.1 cgd * opposed to a collection of them) is never concatenated with anything
77 1.1 cgd * because of operator precedence.) The operand of some types of node is
78 1.1 cgd * a literal string; for others, it is a node leading into a sub-FSM. In
79 1.1 cgd * particular, the operand of a BRANCH node is the first node of the branch.
80 1.1 cgd * (NB this is *not* a tree structure: the tail of the branch connects
81 1.1 cgd * to the thing following the set of BRANCHes.) The opcodes are:
82 1.1 cgd */
83 1.1 cgd
84 1.1 cgd /* definition number opnd? meaning */
85 1.1 cgd #define END 0 /* no End of program. */
86 1.1 cgd #define BOL 1 /* no Match "" at beginning of line. */
87 1.1 cgd #define EOL 2 /* no Match "" at end of line. */
88 1.1 cgd #define ANY 3 /* no Match any one character. */
89 1.1 cgd #define ANYOF 4 /* str Match any character in this string. */
90 1.1 cgd #define ANYBUT 5 /* str Match any character not in this string. */
91 1.1 cgd #define BRANCH 6 /* node Match this alternative, or the next... */
92 1.1 cgd #define BACK 7 /* no Match "", "next" ptr points backward. */
93 1.1 cgd #define EXACTLY 8 /* str Match this string. */
94 1.1 cgd #define NOTHING 9 /* no Match empty string. */
95 1.1 cgd #define STAR 10 /* node Match this (simple) thing 0 or more times. */
96 1.1 cgd #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
97 1.1 cgd #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */
98 1.1 cgd #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */
99 1.1 cgd #define OPEN 20 /* no Mark this point in input as start of #n. */
100 1.1 cgd /* OPEN+1 is number 1, etc. */
101 1.1 cgd #define CLOSE 30 /* no Analogous to OPEN. */
102 1.1 cgd
103 1.1 cgd /*
104 1.1 cgd * Opcode notes:
105 1.1 cgd *
106 1.1 cgd * BRANCH The set of branches constituting a single choice are hooked
107 1.1 cgd * together with their "next" pointers, since precedence prevents
108 1.1 cgd * anything being concatenated to any individual branch. The
109 1.1 cgd * "next" pointer of the last BRANCH in a choice points to the
110 1.1 cgd * thing following the whole choice. This is also where the
111 1.1 cgd * final "next" pointer of each individual branch points; each
112 1.1 cgd * branch starts with the operand node of a BRANCH node.
113 1.1 cgd *
114 1.1 cgd * BACK Normal "next" pointers all implicitly point forward; BACK
115 1.1 cgd * exists to make loop structures possible.
116 1.1 cgd *
117 1.1 cgd * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
118 1.1 cgd * BRANCH structures using BACK. Simple cases (one character
119 1.1 cgd * per match) are implemented with STAR and PLUS for speed
120 1.1 cgd * and to minimize recursive plunges.
121 1.1 cgd *
122 1.1 cgd * OPEN,CLOSE ...are numbered at compile time.
123 1.1 cgd */
124 1.1 cgd
125 1.1 cgd /*
126 1.1 cgd * A node is one char of opcode followed by two chars of "next" pointer.
127 1.1 cgd * "Next" pointers are stored as two 8-bit pieces, high order first. The
128 1.1 cgd * value is a positive offset from the opcode of the node containing it.
129 1.1 cgd * An operand, if any, simply follows the node. (Note that much of the
130 1.1 cgd * code generation knows about this implicit relationship.)
131 1.1 cgd *
132 1.1 cgd * Using two bytes for the "next" pointer is vast overkill for most things,
133 1.1 cgd * but allows patterns to get big without disasters.
134 1.1 cgd */
135 1.1 cgd #define OP(p) (*(p))
136 1.1 cgd #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
137 1.1 cgd #define OPERAND(p) ((p) + 3)
138 1.1 cgd
139 1.1 cgd /*
140 1.1 cgd * See regmagic.h for one further detail of program structure.
141 1.1 cgd */
142 1.1 cgd
143 1.1 cgd
144 1.1 cgd /*
145 1.1 cgd * Utility definitions.
146 1.1 cgd */
147 1.1 cgd #ifndef CHARBITS
148 1.1 cgd #define UCHARAT(p) ((int)*(unsigned char *)(p))
149 1.1 cgd #else
150 1.1 cgd #define UCHARAT(p) ((int)*(p)&CHARBITS)
151 1.1 cgd #endif
152 1.1 cgd
153 1.1 cgd #define FAIL(m) { regerror(m); return(NULL); }
154 1.1 cgd #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
155 1.1 cgd
156 1.1 cgd /*
157 1.1 cgd * Flags to be passed up and down.
158 1.1 cgd */
159 1.1 cgd #define HASWIDTH 01 /* Known never to match null string. */
160 1.1 cgd #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
161 1.1 cgd #define SPSTART 04 /* Starts with * or +. */
162 1.1 cgd #define WORST 0 /* Worst case. */
163 1.1 cgd
164 1.1 cgd /*
165 1.1 cgd * Global work variables for regcomp().
166 1.1 cgd */
167 1.1 cgd static char *regparse; /* Input-scan pointer. */
168 1.1 cgd static int regnpar; /* () count. */
169 1.1 cgd static char regdummy;
170 1.1 cgd static char *regcode; /* Code-emit pointer; ®dummy = don't. */
171 1.1 cgd static long regsize; /* Code size. */
172 1.1 cgd
173 1.1 cgd /*
174 1.1 cgd * Forward declarations for regcomp()'s friends.
175 1.1 cgd */
176 1.1 cgd #ifndef STATIC
177 1.1 cgd #define STATIC static
178 1.1 cgd #endif
179 1.4 pk STATIC char *reg __P((int, int *));
180 1.4 pk STATIC char *regbranch __P((int *));
181 1.4 pk STATIC char *regpiece __P((int *));
182 1.4 pk STATIC char *regatom __P((int *));
183 1.10 christos STATIC char *regnode __P((int));
184 1.4 pk STATIC char *regnext __P((char *));
185 1.10 christos STATIC void regc __P((int));
186 1.10 christos STATIC void reginsert __P((int, char *));
187 1.4 pk STATIC void regtail __P((char *, char *));
188 1.4 pk STATIC void regoptail __P((char *, char *));
189 1.1 cgd #ifdef STRCSPN
190 1.4 pk STATIC int strcspn __P((char *, char *));
191 1.1 cgd #endif
192 1.1 cgd
193 1.1 cgd /*
194 1.1 cgd - regcomp - compile a regular expression into internal code
195 1.1 cgd *
196 1.1 cgd * We can't allocate space until we know how big the compiled form will be,
197 1.1 cgd * but we can't compile it (and thus know how big it is) until we've got a
198 1.1 cgd * place to put the code. So we cheat: we compile it twice, once with code
199 1.1 cgd * generation turned off and size counting turned on, and once "for real".
200 1.1 cgd * This also means that we don't allocate space until we are sure that the
201 1.1 cgd * thing really will compile successfully, and we never have to move the
202 1.1 cgd * code and thus invalidate pointers into it. (Note that it has to be in
203 1.1 cgd * one piece because free() must be able to free it all.)
204 1.1 cgd *
205 1.1 cgd * Beware that the optimization-preparation code in here knows about some
206 1.1 cgd * of the structure of the compiled regexp.
207 1.1 cgd */
208 1.1 cgd regexp *
209 1.15 thorpej __compat_regcomp(expn)
210 1.15 thorpej const char *expn;
211 1.1 cgd {
212 1.8 perry regexp *r;
213 1.8 perry char *scan;
214 1.8 perry char *longest;
215 1.8 perry int len;
216 1.1 cgd int flags;
217 1.1 cgd
218 1.15 thorpej if (expn == NULL)
219 1.1 cgd FAIL("NULL argument");
220 1.1 cgd
221 1.1 cgd /* First pass: determine size, legality. */
222 1.1 cgd #ifdef notdef
223 1.15 thorpej if (expn[0] == '.' && expn[1] == '*') expn += 2; /* aid grep */
224 1.1 cgd #endif
225 1.10 christos /* LINTED const castaway */
226 1.15 thorpej regparse = (char *)expn;
227 1.1 cgd regnpar = 1;
228 1.1 cgd regsize = 0L;
229 1.1 cgd regcode = ®dummy;
230 1.1 cgd regc(MAGIC);
231 1.1 cgd if (reg(0, &flags) == NULL)
232 1.1 cgd return(NULL);
233 1.1 cgd
234 1.1 cgd /* Small enough for pointer-storage convention? */
235 1.1 cgd if (regsize >= 32767L) /* Probably could be 65535L. */
236 1.1 cgd FAIL("regexp too big");
237 1.1 cgd
238 1.1 cgd /* Allocate space. */
239 1.19 christos r = malloc(sizeof(regexp) + (unsigned)regsize);
240 1.1 cgd if (r == NULL)
241 1.1 cgd FAIL("out of space");
242 1.1 cgd
243 1.1 cgd /* Second pass: emit code. */
244 1.10 christos /* LINTED const castaway */
245 1.15 thorpej regparse = (char *)expn;
246 1.1 cgd regnpar = 1;
247 1.1 cgd regcode = r->program;
248 1.1 cgd regc(MAGIC);
249 1.19 christos if (reg(0, &flags) == NULL) {
250 1.19 christos free(r);
251 1.1 cgd return(NULL);
252 1.19 christos }
253 1.1 cgd
254 1.1 cgd /* Dig out information for optimizations. */
255 1.1 cgd r->regstart = '\0'; /* Worst-case defaults. */
256 1.1 cgd r->reganch = 0;
257 1.1 cgd r->regmust = NULL;
258 1.1 cgd r->regmlen = 0;
259 1.1 cgd scan = r->program+1; /* First BRANCH. */
260 1.1 cgd if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
261 1.1 cgd scan = OPERAND(scan);
262 1.1 cgd
263 1.1 cgd /* Starting-point info. */
264 1.1 cgd if (OP(scan) == EXACTLY)
265 1.1 cgd r->regstart = *OPERAND(scan);
266 1.1 cgd else if (OP(scan) == BOL)
267 1.1 cgd r->reganch++;
268 1.1 cgd
269 1.1 cgd /*
270 1.1 cgd * If there's something expensive in the r.e., find the
271 1.1 cgd * longest literal string that must appear and make it the
272 1.1 cgd * regmust. Resolve ties in favor of later strings, since
273 1.1 cgd * the regstart check works with the beginning of the r.e.
274 1.1 cgd * and avoiding duplication strengthens checking. Not a
275 1.1 cgd * strong reason, but sufficient in the absence of others.
276 1.1 cgd */
277 1.1 cgd if (flags&SPSTART) {
278 1.1 cgd longest = NULL;
279 1.1 cgd len = 0;
280 1.1 cgd for (; scan != NULL; scan = regnext(scan))
281 1.15 thorpej if (OP(scan) == EXACTLY && (int) strlen(OPERAND(scan)) >= len) {
282 1.1 cgd longest = OPERAND(scan);
283 1.1 cgd len = strlen(OPERAND(scan));
284 1.1 cgd }
285 1.1 cgd r->regmust = longest;
286 1.1 cgd r->regmlen = len;
287 1.1 cgd }
288 1.1 cgd }
289 1.1 cgd
290 1.1 cgd return(r);
291 1.1 cgd }
292 1.1 cgd
293 1.1 cgd /*
294 1.1 cgd - reg - regular expression, i.e. main body or parenthesized thing
295 1.1 cgd *
296 1.1 cgd * Caller must absorb opening parenthesis.
297 1.1 cgd *
298 1.1 cgd * Combining parenthesis handling with the base level of regular expression
299 1.1 cgd * is a trifle forced, but the need to tie the tails of the branches to what
300 1.1 cgd * follows makes it hard to avoid.
301 1.1 cgd */
302 1.1 cgd static char *
303 1.1 cgd reg(paren, flagp)
304 1.1 cgd int paren; /* Parenthesized? */
305 1.1 cgd int *flagp;
306 1.1 cgd {
307 1.8 perry char *ret;
308 1.8 perry char *br;
309 1.8 perry char *ender;
310 1.8 perry int parno = 0;
311 1.1 cgd int flags;
312 1.1 cgd
313 1.1 cgd *flagp = HASWIDTH; /* Tentatively. */
314 1.1 cgd
315 1.1 cgd /* Make an OPEN node, if parenthesized. */
316 1.1 cgd if (paren) {
317 1.1 cgd if (regnpar >= NSUBEXP)
318 1.1 cgd FAIL("too many ()");
319 1.1 cgd parno = regnpar;
320 1.1 cgd regnpar++;
321 1.1 cgd ret = regnode(OPEN+parno);
322 1.1 cgd } else
323 1.1 cgd ret = NULL;
324 1.1 cgd
325 1.1 cgd /* Pick up the branches, linking them together. */
326 1.1 cgd br = regbranch(&flags);
327 1.1 cgd if (br == NULL)
328 1.1 cgd return(NULL);
329 1.1 cgd if (ret != NULL)
330 1.1 cgd regtail(ret, br); /* OPEN -> first. */
331 1.1 cgd else
332 1.1 cgd ret = br;
333 1.1 cgd if (!(flags&HASWIDTH))
334 1.1 cgd *flagp &= ~HASWIDTH;
335 1.1 cgd *flagp |= flags&SPSTART;
336 1.1 cgd while (*regparse == '|' || *regparse == '\n') {
337 1.1 cgd regparse++;
338 1.1 cgd br = regbranch(&flags);
339 1.1 cgd if (br == NULL)
340 1.1 cgd return(NULL);
341 1.1 cgd regtail(ret, br); /* BRANCH -> BRANCH. */
342 1.1 cgd if (!(flags&HASWIDTH))
343 1.1 cgd *flagp &= ~HASWIDTH;
344 1.1 cgd *flagp |= flags&SPSTART;
345 1.1 cgd }
346 1.1 cgd
347 1.1 cgd /* Make a closing node, and hook it on the end. */
348 1.11 simonb ender = regnode((paren) ? CLOSE+parno : END);
349 1.1 cgd regtail(ret, ender);
350 1.1 cgd
351 1.1 cgd /* Hook the tails of the branches to the closing node. */
352 1.1 cgd for (br = ret; br != NULL; br = regnext(br))
353 1.1 cgd regoptail(br, ender);
354 1.1 cgd
355 1.1 cgd /* Check for proper termination. */
356 1.1 cgd if (paren && *regparse++ != ')') {
357 1.1 cgd FAIL("unmatched ()");
358 1.1 cgd } else if (!paren && *regparse != '\0') {
359 1.1 cgd if (*regparse == ')') {
360 1.1 cgd FAIL("unmatched ()");
361 1.1 cgd } else
362 1.1 cgd FAIL("junk on end"); /* "Can't happen". */
363 1.1 cgd /* NOTREACHED */
364 1.1 cgd }
365 1.1 cgd
366 1.1 cgd return(ret);
367 1.1 cgd }
368 1.1 cgd
369 1.1 cgd /*
370 1.1 cgd - regbranch - one alternative of an | operator
371 1.1 cgd *
372 1.1 cgd * Implements the concatenation operator.
373 1.1 cgd */
374 1.1 cgd static char *
375 1.1 cgd regbranch(flagp)
376 1.1 cgd int *flagp;
377 1.1 cgd {
378 1.8 perry char *ret;
379 1.8 perry char *chain;
380 1.8 perry char *latest;
381 1.1 cgd int flags;
382 1.1 cgd
383 1.1 cgd *flagp = WORST; /* Tentatively. */
384 1.1 cgd
385 1.1 cgd ret = regnode(BRANCH);
386 1.1 cgd chain = NULL;
387 1.1 cgd while (*regparse != '\0' && *regparse != ')' &&
388 1.1 cgd *regparse != '\n' && *regparse != '|') {
389 1.1 cgd latest = regpiece(&flags);
390 1.1 cgd if (latest == NULL)
391 1.1 cgd return(NULL);
392 1.1 cgd *flagp |= flags&HASWIDTH;
393 1.1 cgd if (chain == NULL) /* First piece. */
394 1.1 cgd *flagp |= flags&SPSTART;
395 1.1 cgd else
396 1.1 cgd regtail(chain, latest);
397 1.1 cgd chain = latest;
398 1.1 cgd }
399 1.1 cgd if (chain == NULL) /* Loop ran zero times. */
400 1.1 cgd (void) regnode(NOTHING);
401 1.1 cgd
402 1.1 cgd return(ret);
403 1.1 cgd }
404 1.1 cgd
405 1.1 cgd /*
406 1.1 cgd - regpiece - something followed by possible [*+?]
407 1.1 cgd *
408 1.1 cgd * Note that the branching code sequences used for ? and the general cases
409 1.1 cgd * of * and + are somewhat optimized: they use the same NOTHING node as
410 1.1 cgd * both the endmarker for their branch list and the body of the last branch.
411 1.1 cgd * It might seem that this node could be dispensed with entirely, but the
412 1.1 cgd * endmarker role is not redundant.
413 1.1 cgd */
414 1.1 cgd static char *
415 1.1 cgd regpiece(flagp)
416 1.1 cgd int *flagp;
417 1.1 cgd {
418 1.8 perry char *ret;
419 1.8 perry char op;
420 1.8 perry char *next;
421 1.1 cgd int flags;
422 1.1 cgd
423 1.1 cgd ret = regatom(&flags);
424 1.1 cgd if (ret == NULL)
425 1.1 cgd return(NULL);
426 1.1 cgd
427 1.1 cgd op = *regparse;
428 1.1 cgd if (!ISMULT(op)) {
429 1.1 cgd *flagp = flags;
430 1.1 cgd return(ret);
431 1.1 cgd }
432 1.1 cgd
433 1.1 cgd if (!(flags&HASWIDTH) && op != '?')
434 1.1 cgd FAIL("*+ operand could be empty");
435 1.1 cgd *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
436 1.1 cgd
437 1.1 cgd if (op == '*' && (flags&SIMPLE))
438 1.1 cgd reginsert(STAR, ret);
439 1.1 cgd else if (op == '*') {
440 1.1 cgd /* Emit x* as (x&|), where & means "self". */
441 1.1 cgd reginsert(BRANCH, ret); /* Either x */
442 1.1 cgd regoptail(ret, regnode(BACK)); /* and loop */
443 1.1 cgd regoptail(ret, ret); /* back */
444 1.1 cgd regtail(ret, regnode(BRANCH)); /* or */
445 1.1 cgd regtail(ret, regnode(NOTHING)); /* null. */
446 1.1 cgd } else if (op == '+' && (flags&SIMPLE))
447 1.1 cgd reginsert(PLUS, ret);
448 1.1 cgd else if (op == '+') {
449 1.1 cgd /* Emit x+ as x(&|), where & means "self". */
450 1.1 cgd next = regnode(BRANCH); /* Either */
451 1.1 cgd regtail(ret, next);
452 1.1 cgd regtail(regnode(BACK), ret); /* loop back */
453 1.1 cgd regtail(next, regnode(BRANCH)); /* or */
454 1.1 cgd regtail(ret, regnode(NOTHING)); /* null. */
455 1.1 cgd } else if (op == '?') {
456 1.1 cgd /* Emit x? as (x|) */
457 1.1 cgd reginsert(BRANCH, ret); /* Either x */
458 1.1 cgd regtail(ret, regnode(BRANCH)); /* or */
459 1.1 cgd next = regnode(NOTHING); /* null. */
460 1.1 cgd regtail(ret, next);
461 1.1 cgd regoptail(ret, next);
462 1.1 cgd }
463 1.1 cgd regparse++;
464 1.1 cgd if (ISMULT(*regparse))
465 1.1 cgd FAIL("nested *?+");
466 1.1 cgd
467 1.1 cgd return(ret);
468 1.1 cgd }
469 1.1 cgd
470 1.1 cgd /*
471 1.1 cgd - regatom - the lowest level
472 1.1 cgd *
473 1.1 cgd * Optimization: gobbles an entire sequence of ordinary characters so that
474 1.1 cgd * it can turn them into a single node, which is smaller to store and
475 1.1 cgd * faster to run. Backslashed characters are exceptions, each becoming a
476 1.1 cgd * separate node; the code is simpler that way and it's not worth fixing.
477 1.1 cgd */
478 1.1 cgd static char *
479 1.1 cgd regatom(flagp)
480 1.1 cgd int *flagp;
481 1.1 cgd {
482 1.8 perry char *ret;
483 1.1 cgd int flags;
484 1.1 cgd
485 1.1 cgd *flagp = WORST; /* Tentatively. */
486 1.1 cgd
487 1.1 cgd switch (*regparse++) {
488 1.1 cgd /* FIXME: these chars only have meaning at beg/end of pat? */
489 1.1 cgd case '^':
490 1.1 cgd ret = regnode(BOL);
491 1.1 cgd break;
492 1.1 cgd case '$':
493 1.1 cgd ret = regnode(EOL);
494 1.1 cgd break;
495 1.1 cgd case '.':
496 1.1 cgd ret = regnode(ANY);
497 1.1 cgd *flagp |= HASWIDTH|SIMPLE;
498 1.1 cgd break;
499 1.1 cgd case '[': {
500 1.8 perry int class;
501 1.8 perry int classend;
502 1.1 cgd
503 1.1 cgd if (*regparse == '^') { /* Complement of range. */
504 1.1 cgd ret = regnode(ANYBUT);
505 1.1 cgd regparse++;
506 1.1 cgd } else
507 1.1 cgd ret = regnode(ANYOF);
508 1.1 cgd if (*regparse == ']' || *regparse == '-')
509 1.1 cgd regc(*regparse++);
510 1.1 cgd while (*regparse != '\0' && *regparse != ']') {
511 1.1 cgd if (*regparse == '-') {
512 1.1 cgd regparse++;
513 1.1 cgd if (*regparse == ']' || *regparse == '\0')
514 1.1 cgd regc('-');
515 1.1 cgd else {
516 1.1 cgd class = UCHARAT(regparse-2)+1;
517 1.1 cgd classend = UCHARAT(regparse);
518 1.1 cgd if (class > classend+1)
519 1.1 cgd FAIL("invalid [] range");
520 1.1 cgd for (; class <= classend; class++)
521 1.1 cgd regc(class);
522 1.1 cgd regparse++;
523 1.1 cgd }
524 1.1 cgd } else
525 1.1 cgd regc(*regparse++);
526 1.1 cgd }
527 1.1 cgd regc('\0');
528 1.1 cgd if (*regparse != ']')
529 1.1 cgd FAIL("unmatched []");
530 1.1 cgd regparse++;
531 1.1 cgd *flagp |= HASWIDTH|SIMPLE;
532 1.1 cgd }
533 1.1 cgd break;
534 1.1 cgd case '(':
535 1.1 cgd ret = reg(1, &flags);
536 1.1 cgd if (ret == NULL)
537 1.1 cgd return(NULL);
538 1.1 cgd *flagp |= flags&(HASWIDTH|SPSTART);
539 1.1 cgd break;
540 1.1 cgd case '\0':
541 1.1 cgd case '|':
542 1.1 cgd case '\n':
543 1.1 cgd case ')':
544 1.1 cgd FAIL("internal urp"); /* Supposed to be caught earlier. */
545 1.1 cgd case '?':
546 1.1 cgd case '+':
547 1.1 cgd case '*':
548 1.1 cgd FAIL("?+* follows nothing");
549 1.1 cgd case '\\':
550 1.1 cgd switch (*regparse++) {
551 1.1 cgd case '\0':
552 1.1 cgd FAIL("trailing \\");
553 1.1 cgd case '<':
554 1.1 cgd ret = regnode(WORDA);
555 1.1 cgd break;
556 1.1 cgd case '>':
557 1.1 cgd ret = regnode(WORDZ);
558 1.1 cgd break;
559 1.1 cgd /* FIXME: Someday handle \1, \2, ... */
560 1.1 cgd default:
561 1.1 cgd /* Handle general quoted chars in exact-match routine */
562 1.1 cgd goto de_fault;
563 1.1 cgd }
564 1.1 cgd break;
565 1.1 cgd de_fault:
566 1.10 christos /*FALLTHROUGH*/
567 1.1 cgd default:
568 1.1 cgd /*
569 1.1 cgd * Encode a string of characters to be matched exactly.
570 1.1 cgd *
571 1.1 cgd * This is a bit tricky due to quoted chars and due to
572 1.1 cgd * '*', '+', and '?' taking the SINGLE char previous
573 1.1 cgd * as their operand.
574 1.1 cgd *
575 1.1 cgd * On entry, the char at regparse[-1] is going to go
576 1.1 cgd * into the string, no matter what it is. (It could be
577 1.1 cgd * following a \ if we are entered from the '\' case.)
578 1.11 simonb *
579 1.1 cgd * Basic idea is to pick up a good char in ch and
580 1.1 cgd * examine the next char. If it's *+? then we twiddle.
581 1.1 cgd * If it's \ then we frozzle. If it's other magic char
582 1.1 cgd * we push ch and terminate the string. If none of the
583 1.1 cgd * above, we push ch on the string and go around again.
584 1.1 cgd *
585 1.1 cgd * regprev is used to remember where "the current char"
586 1.1 cgd * starts in the string, if due to a *+? we need to back
587 1.1 cgd * up and put the current char in a separate, 1-char, string.
588 1.1 cgd * When regprev is NULL, ch is the only char in the
589 1.1 cgd * string; this is used in *+? handling, and in setting
590 1.1 cgd * flags |= SIMPLE at the end.
591 1.1 cgd */
592 1.1 cgd {
593 1.1 cgd char *regprev;
594 1.8 perry char ch;
595 1.1 cgd
596 1.1 cgd regparse--; /* Look at cur char */
597 1.1 cgd ret = regnode(EXACTLY);
598 1.1 cgd for ( regprev = 0 ; ; ) {
599 1.1 cgd ch = *regparse++; /* Get current char */
600 1.1 cgd switch (*regparse) { /* look at next one */
601 1.1 cgd
602 1.1 cgd default:
603 1.1 cgd regc(ch); /* Add cur to string */
604 1.1 cgd break;
605 1.1 cgd
606 1.1 cgd case '.': case '[': case '(':
607 1.1 cgd case ')': case '|': case '\n':
608 1.1 cgd case '$': case '^':
609 1.1 cgd case '\0':
610 1.1 cgd /* FIXME, $ and ^ should not always be magic */
611 1.1 cgd magic:
612 1.1 cgd regc(ch); /* dump cur char */
613 1.1 cgd goto done; /* and we are done */
614 1.1 cgd
615 1.1 cgd case '?': case '+': case '*':
616 1.1 cgd if (!regprev) /* If just ch in str, */
617 1.1 cgd goto magic; /* use it */
618 1.1 cgd /* End mult-char string one early */
619 1.1 cgd regparse = regprev; /* Back up parse */
620 1.1 cgd goto done;
621 1.1 cgd
622 1.1 cgd case '\\':
623 1.1 cgd regc(ch); /* Cur char OK */
624 1.1 cgd switch (regparse[1]){ /* Look after \ */
625 1.1 cgd case '\0':
626 1.1 cgd case '<':
627 1.1 cgd case '>':
628 1.1 cgd /* FIXME: Someday handle \1, \2, ... */
629 1.1 cgd goto done; /* Not quoted */
630 1.1 cgd default:
631 1.1 cgd /* Backup point is \, scan * point is after it. */
632 1.1 cgd regprev = regparse;
633 1.11 simonb regparse++;
634 1.1 cgd continue; /* NOT break; */
635 1.1 cgd }
636 1.1 cgd }
637 1.1 cgd regprev = regparse; /* Set backup point */
638 1.1 cgd }
639 1.1 cgd done:
640 1.1 cgd regc('\0');
641 1.1 cgd *flagp |= HASWIDTH;
642 1.1 cgd if (!regprev) /* One char? */
643 1.1 cgd *flagp |= SIMPLE;
644 1.1 cgd }
645 1.1 cgd break;
646 1.1 cgd }
647 1.1 cgd
648 1.1 cgd return(ret);
649 1.1 cgd }
650 1.1 cgd
651 1.1 cgd /*
652 1.1 cgd - regnode - emit a node
653 1.1 cgd */
654 1.1 cgd static char * /* Location. */
655 1.1 cgd regnode(op)
656 1.10 christos int op;
657 1.1 cgd {
658 1.8 perry char *ret;
659 1.8 perry char *ptr;
660 1.1 cgd
661 1.1 cgd ret = regcode;
662 1.1 cgd if (ret == ®dummy) {
663 1.1 cgd regsize += 3;
664 1.1 cgd return(ret);
665 1.1 cgd }
666 1.1 cgd
667 1.1 cgd ptr = ret;
668 1.1 cgd *ptr++ = op;
669 1.1 cgd *ptr++ = '\0'; /* Null "next" pointer. */
670 1.1 cgd *ptr++ = '\0';
671 1.1 cgd regcode = ptr;
672 1.1 cgd
673 1.1 cgd return(ret);
674 1.1 cgd }
675 1.1 cgd
676 1.1 cgd /*
677 1.1 cgd - regc - emit (if appropriate) a byte of code
678 1.1 cgd */
679 1.1 cgd static void
680 1.1 cgd regc(b)
681 1.10 christos int b;
682 1.1 cgd {
683 1.1 cgd if (regcode != ®dummy)
684 1.1 cgd *regcode++ = b;
685 1.1 cgd else
686 1.1 cgd regsize++;
687 1.1 cgd }
688 1.1 cgd
689 1.1 cgd /*
690 1.1 cgd - reginsert - insert an operator in front of already-emitted operand
691 1.1 cgd *
692 1.1 cgd * Means relocating the operand.
693 1.1 cgd */
694 1.1 cgd static void
695 1.1 cgd reginsert(op, opnd)
696 1.10 christos int op;
697 1.1 cgd char *opnd;
698 1.1 cgd {
699 1.8 perry char *src;
700 1.8 perry char *dst;
701 1.8 perry char *place;
702 1.1 cgd
703 1.1 cgd if (regcode == ®dummy) {
704 1.1 cgd regsize += 3;
705 1.1 cgd return;
706 1.1 cgd }
707 1.1 cgd
708 1.1 cgd src = regcode;
709 1.1 cgd regcode += 3;
710 1.1 cgd dst = regcode;
711 1.1 cgd while (src > opnd)
712 1.1 cgd *--dst = *--src;
713 1.1 cgd
714 1.1 cgd place = opnd; /* Op node, where operand used to be. */
715 1.1 cgd *place++ = op;
716 1.1 cgd *place++ = '\0';
717 1.1 cgd *place++ = '\0';
718 1.1 cgd }
719 1.1 cgd
720 1.1 cgd /*
721 1.1 cgd - regtail - set the next-pointer at the end of a node chain
722 1.1 cgd */
723 1.1 cgd static void
724 1.1 cgd regtail(p, val)
725 1.1 cgd char *p;
726 1.1 cgd char *val;
727 1.1 cgd {
728 1.8 perry char *scan;
729 1.8 perry char *temp;
730 1.8 perry int offset;
731 1.1 cgd
732 1.1 cgd if (p == ®dummy)
733 1.1 cgd return;
734 1.1 cgd
735 1.1 cgd /* Find last node. */
736 1.1 cgd scan = p;
737 1.1 cgd for (;;) {
738 1.1 cgd temp = regnext(scan);
739 1.1 cgd if (temp == NULL)
740 1.1 cgd break;
741 1.1 cgd scan = temp;
742 1.1 cgd }
743 1.1 cgd
744 1.1 cgd if (OP(scan) == BACK)
745 1.1 cgd offset = scan - val;
746 1.1 cgd else
747 1.1 cgd offset = val - scan;
748 1.10 christos *(scan+1) = ((unsigned int)offset>>8)&0377;
749 1.1 cgd *(scan+2) = offset&0377;
750 1.1 cgd }
751 1.1 cgd
752 1.1 cgd /*
753 1.1 cgd - regoptail - regtail on operand of first argument; nop if operandless
754 1.1 cgd */
755 1.1 cgd static void
756 1.1 cgd regoptail(p, val)
757 1.1 cgd char *p;
758 1.1 cgd char *val;
759 1.1 cgd {
760 1.1 cgd /* "Operandless" and "op != BRANCH" are synonymous in practice. */
761 1.1 cgd if (p == NULL || p == ®dummy || OP(p) != BRANCH)
762 1.1 cgd return;
763 1.1 cgd regtail(OPERAND(p), val);
764 1.1 cgd }
765 1.1 cgd
766 1.1 cgd /*
767 1.1 cgd * regexec and friends
768 1.1 cgd */
769 1.1 cgd
770 1.1 cgd /*
771 1.1 cgd * Global work variables for regexec().
772 1.1 cgd */
773 1.1 cgd static char *reginput; /* String-input pointer. */
774 1.1 cgd static char *regbol; /* Beginning of input, for ^ check. */
775 1.1 cgd static char **regstartp; /* Pointer to startp array. */
776 1.1 cgd static char **regendp; /* Ditto for endp. */
777 1.1 cgd
778 1.1 cgd /*
779 1.1 cgd * Forwards.
780 1.1 cgd */
781 1.4 pk STATIC int regtry __P((const regexp *, const char *));
782 1.4 pk STATIC int regmatch __P((char *));
783 1.4 pk STATIC int regrepeat __P((char *));
784 1.1 cgd
785 1.1 cgd #ifdef DEBUG
786 1.1 cgd int regnarrate = 0;
787 1.4 pk void regdump __P((regexp *));
788 1.4 pk STATIC char *regprop __P((char *));
789 1.1 cgd #endif
790 1.1 cgd
791 1.1 cgd /*
792 1.1 cgd - regexec - match a regexp against a string
793 1.1 cgd */
794 1.1 cgd int
795 1.9 tv __compat_regexec(prog, string)
796 1.8 perry const regexp *prog;
797 1.8 perry const char *string;
798 1.1 cgd {
799 1.8 perry char *s;
800 1.1 cgd
801 1.1 cgd /* Be paranoid... */
802 1.1 cgd if (prog == NULL || string == NULL) {
803 1.1 cgd regerror("NULL parameter");
804 1.1 cgd return(0);
805 1.1 cgd }
806 1.1 cgd
807 1.1 cgd /* Check validity of program. */
808 1.1 cgd if (UCHARAT(prog->program) != MAGIC) {
809 1.1 cgd regerror("corrupted program");
810 1.1 cgd return(0);
811 1.1 cgd }
812 1.1 cgd
813 1.1 cgd /* If there is a "must appear" string, look for it. */
814 1.1 cgd if (prog->regmust != NULL) {
815 1.10 christos /* LINTED const castaway */
816 1.1 cgd s = (char *)string;
817 1.1 cgd while ((s = strchr(s, prog->regmust[0])) != NULL) {
818 1.10 christos if (strncmp(s, prog->regmust,
819 1.10 christos (size_t)prog->regmlen) == 0)
820 1.1 cgd break; /* Found it. */
821 1.1 cgd s++;
822 1.1 cgd }
823 1.1 cgd if (s == NULL) /* Not present. */
824 1.1 cgd return(0);
825 1.1 cgd }
826 1.1 cgd
827 1.1 cgd /* Mark beginning of line for ^ . */
828 1.10 christos /* LINTED const castaway */
829 1.1 cgd regbol = (char *)string;
830 1.1 cgd
831 1.1 cgd /* Simplest case: anchored match need be tried only once. */
832 1.1 cgd if (prog->reganch)
833 1.1 cgd return(regtry(prog, string));
834 1.1 cgd
835 1.1 cgd /* Messy cases: unanchored match. */
836 1.10 christos /* LINTED const castaway */
837 1.1 cgd s = (char *)string;
838 1.1 cgd if (prog->regstart != '\0')
839 1.1 cgd /* We know what char it must start with. */
840 1.1 cgd while ((s = strchr(s, prog->regstart)) != NULL) {
841 1.1 cgd if (regtry(prog, s))
842 1.1 cgd return(1);
843 1.1 cgd s++;
844 1.1 cgd }
845 1.1 cgd else
846 1.1 cgd /* We don't -- general case. */
847 1.1 cgd do {
848 1.1 cgd if (regtry(prog, s))
849 1.1 cgd return(1);
850 1.1 cgd } while (*s++ != '\0');
851 1.1 cgd
852 1.1 cgd /* Failure. */
853 1.1 cgd return(0);
854 1.1 cgd }
855 1.1 cgd
856 1.1 cgd /*
857 1.1 cgd - regtry - try match at specific point
858 1.1 cgd */
859 1.1 cgd static int /* 0 failure, 1 success */
860 1.1 cgd regtry(prog, string)
861 1.4 pk const regexp *prog;
862 1.4 pk const char *string;
863 1.1 cgd {
864 1.8 perry int i;
865 1.8 perry char **sp;
866 1.8 perry char **ep;
867 1.1 cgd
868 1.10 christos /* LINTED const castaway */
869 1.5 cgd reginput = (char *)string; /* XXX */
870 1.5 cgd regstartp = (char **)prog->startp; /* XXX */
871 1.5 cgd regendp = (char **)prog->endp; /* XXX */
872 1.1 cgd
873 1.5 cgd sp = (char **)prog->startp; /* XXX */
874 1.5 cgd ep = (char **)prog->endp; /* XXX */
875 1.1 cgd for (i = NSUBEXP; i > 0; i--) {
876 1.1 cgd *sp++ = NULL;
877 1.1 cgd *ep++ = NULL;
878 1.1 cgd }
879 1.5 cgd if (regmatch((char *)prog->program + 1)) { /* XXX */
880 1.10 christos /* LINTED const castaway */
881 1.5 cgd ((regexp *)prog)->startp[0] = (char *)string; /* XXX */
882 1.10 christos /* LINTED const castaway */
883 1.5 cgd ((regexp *)prog)->endp[0] = reginput; /* XXX */
884 1.1 cgd return(1);
885 1.1 cgd } else
886 1.1 cgd return(0);
887 1.1 cgd }
888 1.1 cgd
889 1.1 cgd /*
890 1.1 cgd - regmatch - main matching routine
891 1.1 cgd *
892 1.1 cgd * Conceptually the strategy is simple: check to see whether the current
893 1.1 cgd * node matches, call self recursively to see whether the rest matches,
894 1.1 cgd * and then act accordingly. In practice we make some effort to avoid
895 1.1 cgd * recursion, in particular by going through "ordinary" nodes (that don't
896 1.1 cgd * need to know whether the rest of the match failed) by a loop instead of
897 1.1 cgd * by recursion.
898 1.1 cgd */
899 1.1 cgd static int /* 0 failure, 1 success */
900 1.1 cgd regmatch(prog)
901 1.1 cgd char *prog;
902 1.1 cgd {
903 1.8 perry char *scan; /* Current node. */
904 1.1 cgd char *next; /* Next node. */
905 1.1 cgd
906 1.1 cgd scan = prog;
907 1.1 cgd #ifdef DEBUG
908 1.1 cgd if (scan != NULL && regnarrate)
909 1.1 cgd fprintf(stderr, "%s(\n", regprop(scan));
910 1.1 cgd #endif
911 1.1 cgd while (scan != NULL) {
912 1.1 cgd #ifdef DEBUG
913 1.1 cgd if (regnarrate)
914 1.1 cgd fprintf(stderr, "%s...\n", regprop(scan));
915 1.1 cgd #endif
916 1.1 cgd next = regnext(scan);
917 1.1 cgd
918 1.1 cgd switch (OP(scan)) {
919 1.1 cgd case BOL:
920 1.1 cgd if (reginput != regbol)
921 1.1 cgd return(0);
922 1.1 cgd break;
923 1.1 cgd case EOL:
924 1.1 cgd if (*reginput != '\0')
925 1.1 cgd return(0);
926 1.1 cgd break;
927 1.1 cgd case WORDA:
928 1.1 cgd /* Must be looking at a letter, digit, or _ */
929 1.13 itohy if ((!isalnum(UCHARAT(reginput))) && *reginput != '_')
930 1.1 cgd return(0);
931 1.1 cgd /* Prev must be BOL or nonword */
932 1.1 cgd if (reginput > regbol &&
933 1.13 itohy (isalnum(UCHARAT(reginput - 1))
934 1.13 itohy || reginput[-1] == '_'))
935 1.1 cgd return(0);
936 1.1 cgd break;
937 1.1 cgd case WORDZ:
938 1.1 cgd /* Must be looking at non letter, digit, or _ */
939 1.13 itohy if (isalnum(UCHARAT(reginput)) || *reginput == '_')
940 1.1 cgd return(0);
941 1.1 cgd /* We don't care what the previous char was */
942 1.1 cgd break;
943 1.1 cgd case ANY:
944 1.1 cgd if (*reginput == '\0')
945 1.1 cgd return(0);
946 1.1 cgd reginput++;
947 1.1 cgd break;
948 1.1 cgd case EXACTLY: {
949 1.8 perry int len;
950 1.8 perry char *opnd;
951 1.1 cgd
952 1.1 cgd opnd = OPERAND(scan);
953 1.1 cgd /* Inline the first character, for speed. */
954 1.1 cgd if (*opnd != *reginput)
955 1.1 cgd return(0);
956 1.1 cgd len = strlen(opnd);
957 1.11 simonb if (len > 1 && strncmp(opnd, reginput,
958 1.10 christos (size_t)len) != 0)
959 1.1 cgd return(0);
960 1.1 cgd reginput += len;
961 1.1 cgd }
962 1.1 cgd break;
963 1.1 cgd case ANYOF:
964 1.1 cgd if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
965 1.1 cgd return(0);
966 1.1 cgd reginput++;
967 1.1 cgd break;
968 1.1 cgd case ANYBUT:
969 1.1 cgd if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
970 1.1 cgd return(0);
971 1.1 cgd reginput++;
972 1.1 cgd break;
973 1.1 cgd case NOTHING:
974 1.1 cgd break;
975 1.1 cgd case BACK:
976 1.1 cgd break;
977 1.1 cgd case OPEN+1:
978 1.1 cgd case OPEN+2:
979 1.1 cgd case OPEN+3:
980 1.1 cgd case OPEN+4:
981 1.1 cgd case OPEN+5:
982 1.1 cgd case OPEN+6:
983 1.1 cgd case OPEN+7:
984 1.1 cgd case OPEN+8:
985 1.1 cgd case OPEN+9: {
986 1.8 perry int no;
987 1.8 perry char *save;
988 1.1 cgd
989 1.1 cgd no = OP(scan) - OPEN;
990 1.1 cgd save = reginput;
991 1.1 cgd
992 1.1 cgd if (regmatch(next)) {
993 1.1 cgd /*
994 1.1 cgd * Don't set startp if some later
995 1.1 cgd * invocation of the same parentheses
996 1.1 cgd * already has.
997 1.1 cgd */
998 1.1 cgd if (regstartp[no] == NULL)
999 1.1 cgd regstartp[no] = save;
1000 1.1 cgd return(1);
1001 1.1 cgd } else
1002 1.1 cgd return(0);
1003 1.1 cgd }
1004 1.1 cgd case CLOSE+1:
1005 1.1 cgd case CLOSE+2:
1006 1.1 cgd case CLOSE+3:
1007 1.1 cgd case CLOSE+4:
1008 1.1 cgd case CLOSE+5:
1009 1.1 cgd case CLOSE+6:
1010 1.1 cgd case CLOSE+7:
1011 1.1 cgd case CLOSE+8:
1012 1.1 cgd case CLOSE+9: {
1013 1.8 perry int no;
1014 1.8 perry char *save;
1015 1.1 cgd
1016 1.1 cgd no = OP(scan) - CLOSE;
1017 1.1 cgd save = reginput;
1018 1.1 cgd
1019 1.1 cgd if (regmatch(next)) {
1020 1.1 cgd /*
1021 1.1 cgd * Don't set endp if some later
1022 1.1 cgd * invocation of the same parentheses
1023 1.1 cgd * already has.
1024 1.1 cgd */
1025 1.1 cgd if (regendp[no] == NULL)
1026 1.1 cgd regendp[no] = save;
1027 1.1 cgd return(1);
1028 1.1 cgd } else
1029 1.1 cgd return(0);
1030 1.1 cgd }
1031 1.1 cgd case BRANCH: {
1032 1.8 perry char *save;
1033 1.1 cgd
1034 1.1 cgd if (OP(next) != BRANCH) /* No choice. */
1035 1.1 cgd next = OPERAND(scan); /* Avoid recursion. */
1036 1.1 cgd else {
1037 1.1 cgd do {
1038 1.1 cgd save = reginput;
1039 1.1 cgd if (regmatch(OPERAND(scan)))
1040 1.1 cgd return(1);
1041 1.1 cgd reginput = save;
1042 1.1 cgd scan = regnext(scan);
1043 1.1 cgd } while (scan != NULL && OP(scan) == BRANCH);
1044 1.1 cgd return(0);
1045 1.1 cgd /* NOTREACHED */
1046 1.1 cgd }
1047 1.1 cgd }
1048 1.1 cgd break;
1049 1.1 cgd case STAR:
1050 1.1 cgd case PLUS: {
1051 1.8 perry char nextch;
1052 1.8 perry int no;
1053 1.8 perry char *save;
1054 1.8 perry int min;
1055 1.1 cgd
1056 1.1 cgd /*
1057 1.1 cgd * Lookahead to avoid useless match attempts
1058 1.1 cgd * when we know what character comes next.
1059 1.1 cgd */
1060 1.1 cgd nextch = '\0';
1061 1.1 cgd if (OP(next) == EXACTLY)
1062 1.1 cgd nextch = *OPERAND(next);
1063 1.1 cgd min = (OP(scan) == STAR) ? 0 : 1;
1064 1.1 cgd save = reginput;
1065 1.1 cgd no = regrepeat(OPERAND(scan));
1066 1.1 cgd while (no >= min) {
1067 1.1 cgd /* If it could work, try it. */
1068 1.1 cgd if (nextch == '\0' || *reginput == nextch)
1069 1.1 cgd if (regmatch(next))
1070 1.1 cgd return(1);
1071 1.1 cgd /* Couldn't or didn't -- back up. */
1072 1.1 cgd no--;
1073 1.1 cgd reginput = save + no;
1074 1.1 cgd }
1075 1.1 cgd return(0);
1076 1.1 cgd }
1077 1.1 cgd case END:
1078 1.1 cgd return(1); /* Success! */
1079 1.1 cgd default:
1080 1.1 cgd regerror("memory corruption");
1081 1.1 cgd return(0);
1082 1.1 cgd }
1083 1.1 cgd
1084 1.1 cgd scan = next;
1085 1.1 cgd }
1086 1.1 cgd
1087 1.1 cgd /*
1088 1.1 cgd * We get here only if there's trouble -- normally "case END" is
1089 1.1 cgd * the terminating point.
1090 1.1 cgd */
1091 1.1 cgd regerror("corrupted pointers");
1092 1.1 cgd return(0);
1093 1.1 cgd }
1094 1.1 cgd
1095 1.1 cgd /*
1096 1.1 cgd - regrepeat - repeatedly match something simple, report how many
1097 1.1 cgd */
1098 1.1 cgd static int
1099 1.1 cgd regrepeat(p)
1100 1.1 cgd char *p;
1101 1.1 cgd {
1102 1.8 perry int count = 0;
1103 1.8 perry char *scan;
1104 1.8 perry char *opnd;
1105 1.1 cgd
1106 1.1 cgd scan = reginput;
1107 1.1 cgd opnd = OPERAND(p);
1108 1.1 cgd switch (OP(p)) {
1109 1.1 cgd case ANY:
1110 1.1 cgd count = strlen(scan);
1111 1.1 cgd scan += count;
1112 1.1 cgd break;
1113 1.1 cgd case EXACTLY:
1114 1.1 cgd while (*opnd == *scan) {
1115 1.1 cgd count++;
1116 1.1 cgd scan++;
1117 1.1 cgd }
1118 1.1 cgd break;
1119 1.1 cgd case ANYOF:
1120 1.1 cgd while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1121 1.1 cgd count++;
1122 1.1 cgd scan++;
1123 1.1 cgd }
1124 1.1 cgd break;
1125 1.1 cgd case ANYBUT:
1126 1.1 cgd while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1127 1.1 cgd count++;
1128 1.1 cgd scan++;
1129 1.1 cgd }
1130 1.1 cgd break;
1131 1.1 cgd default: /* Oh dear. Called inappropriately. */
1132 1.1 cgd regerror("internal foulup");
1133 1.1 cgd count = 0; /* Best compromise. */
1134 1.1 cgd break;
1135 1.1 cgd }
1136 1.1 cgd reginput = scan;
1137 1.1 cgd
1138 1.1 cgd return(count);
1139 1.1 cgd }
1140 1.1 cgd
1141 1.1 cgd /*
1142 1.1 cgd - regnext - dig the "next" pointer out of a node
1143 1.1 cgd */
1144 1.1 cgd static char *
1145 1.1 cgd regnext(p)
1146 1.8 perry char *p;
1147 1.1 cgd {
1148 1.8 perry int offset;
1149 1.1 cgd
1150 1.1 cgd if (p == ®dummy)
1151 1.1 cgd return(NULL);
1152 1.1 cgd
1153 1.1 cgd offset = NEXT(p);
1154 1.1 cgd if (offset == 0)
1155 1.1 cgd return(NULL);
1156 1.1 cgd
1157 1.1 cgd if (OP(p) == BACK)
1158 1.1 cgd return(p-offset);
1159 1.1 cgd else
1160 1.1 cgd return(p+offset);
1161 1.1 cgd }
1162 1.1 cgd
1163 1.1 cgd #ifdef DEBUG
1164 1.1 cgd
1165 1.1 cgd /*
1166 1.1 cgd - regdump - dump a regexp onto stdout in vaguely comprehensible form
1167 1.1 cgd */
1168 1.1 cgd void
1169 1.1 cgd regdump(r)
1170 1.1 cgd regexp *r;
1171 1.1 cgd {
1172 1.8 perry char *s;
1173 1.8 perry char op = EXACTLY; /* Arbitrary non-END op. */
1174 1.8 perry char *next;
1175 1.1 cgd
1176 1.1 cgd
1177 1.1 cgd s = r->program + 1;
1178 1.1 cgd while (op != END) { /* While that wasn't END last time... */
1179 1.1 cgd op = OP(s);
1180 1.18 freza printf("%2td%s", s-r->program, regprop(s)); /* Where, what. */
1181 1.1 cgd next = regnext(s);
1182 1.1 cgd if (next == NULL) /* Next ptr. */
1183 1.1 cgd printf("(0)");
1184 1.11 simonb else
1185 1.18 freza printf("(%td)", (s-r->program)+(next-s));
1186 1.1 cgd s += 3;
1187 1.1 cgd if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1188 1.1 cgd /* Literal string, where present. */
1189 1.1 cgd while (*s != '\0') {
1190 1.1 cgd putchar(*s);
1191 1.1 cgd s++;
1192 1.1 cgd }
1193 1.1 cgd s++;
1194 1.1 cgd }
1195 1.1 cgd putchar('\n');
1196 1.1 cgd }
1197 1.1 cgd
1198 1.1 cgd /* Header fields of interest. */
1199 1.1 cgd if (r->regstart != '\0')
1200 1.1 cgd printf("start `%c' ", r->regstart);
1201 1.1 cgd if (r->reganch)
1202 1.1 cgd printf("anchored ");
1203 1.1 cgd if (r->regmust != NULL)
1204 1.1 cgd printf("must have \"%s\"", r->regmust);
1205 1.1 cgd printf("\n");
1206 1.1 cgd }
1207 1.1 cgd
1208 1.1 cgd /*
1209 1.1 cgd - regprop - printable representation of opcode
1210 1.1 cgd */
1211 1.1 cgd static char *
1212 1.1 cgd regprop(op)
1213 1.1 cgd char *op;
1214 1.1 cgd {
1215 1.8 perry char *p;
1216 1.1 cgd static char buf[50];
1217 1.1 cgd
1218 1.6 mrg (void)strncpy(buf, ":", sizeof(buf) - 1);
1219 1.1 cgd
1220 1.1 cgd switch (OP(op)) {
1221 1.1 cgd case BOL:
1222 1.1 cgd p = "BOL";
1223 1.1 cgd break;
1224 1.1 cgd case EOL:
1225 1.1 cgd p = "EOL";
1226 1.1 cgd break;
1227 1.1 cgd case ANY:
1228 1.1 cgd p = "ANY";
1229 1.1 cgd break;
1230 1.1 cgd case ANYOF:
1231 1.1 cgd p = "ANYOF";
1232 1.1 cgd break;
1233 1.1 cgd case ANYBUT:
1234 1.1 cgd p = "ANYBUT";
1235 1.1 cgd break;
1236 1.1 cgd case BRANCH:
1237 1.1 cgd p = "BRANCH";
1238 1.1 cgd break;
1239 1.1 cgd case EXACTLY:
1240 1.1 cgd p = "EXACTLY";
1241 1.1 cgd break;
1242 1.1 cgd case NOTHING:
1243 1.1 cgd p = "NOTHING";
1244 1.1 cgd break;
1245 1.1 cgd case BACK:
1246 1.1 cgd p = "BACK";
1247 1.1 cgd break;
1248 1.1 cgd case END:
1249 1.1 cgd p = "END";
1250 1.1 cgd break;
1251 1.1 cgd case OPEN+1:
1252 1.1 cgd case OPEN+2:
1253 1.1 cgd case OPEN+3:
1254 1.1 cgd case OPEN+4:
1255 1.1 cgd case OPEN+5:
1256 1.1 cgd case OPEN+6:
1257 1.1 cgd case OPEN+7:
1258 1.1 cgd case OPEN+8:
1259 1.1 cgd case OPEN+9:
1260 1.6 mrg (void)snprintf(buf+strlen(buf), sizeof(buf) - strlen(buf),
1261 1.6 mrg "OPEN%d", OP(op)-OPEN);
1262 1.1 cgd p = NULL;
1263 1.1 cgd break;
1264 1.1 cgd case CLOSE+1:
1265 1.1 cgd case CLOSE+2:
1266 1.1 cgd case CLOSE+3:
1267 1.1 cgd case CLOSE+4:
1268 1.1 cgd case CLOSE+5:
1269 1.1 cgd case CLOSE+6:
1270 1.1 cgd case CLOSE+7:
1271 1.1 cgd case CLOSE+8:
1272 1.1 cgd case CLOSE+9:
1273 1.6 mrg (void)snprintf(buf+strlen(buf), sizeof(buf) - strlen(buf),
1274 1.6 mrg "CLOSE%d", OP(op)-CLOSE);
1275 1.1 cgd p = NULL;
1276 1.1 cgd break;
1277 1.1 cgd case STAR:
1278 1.1 cgd p = "STAR";
1279 1.1 cgd break;
1280 1.1 cgd case PLUS:
1281 1.1 cgd p = "PLUS";
1282 1.1 cgd break;
1283 1.1 cgd case WORDA:
1284 1.1 cgd p = "WORDA";
1285 1.1 cgd break;
1286 1.1 cgd case WORDZ:
1287 1.1 cgd p = "WORDZ";
1288 1.1 cgd break;
1289 1.1 cgd default:
1290 1.17 christos p = NULL;
1291 1.1 cgd regerror("corrupted opcode");
1292 1.1 cgd break;
1293 1.1 cgd }
1294 1.1 cgd if (p != NULL)
1295 1.6 mrg (void)strncat(buf, p, sizeof(buf) - strlen(buf) - 1);
1296 1.1 cgd return(buf);
1297 1.1 cgd }
1298 1.1 cgd #endif
1299 1.1 cgd
1300 1.1 cgd /*
1301 1.1 cgd * The following is provided for those people who do not have strcspn() in
1302 1.1 cgd * their C libraries. They should get off their butts and do something
1303 1.1 cgd * about it; at least one public-domain implementation of those (highly
1304 1.1 cgd * useful) string routines has been published on Usenet.
1305 1.1 cgd */
1306 1.1 cgd #ifdef STRCSPN
1307 1.1 cgd /*
1308 1.1 cgd * strcspn - find length of initial segment of s1 consisting entirely
1309 1.1 cgd * of characters not from s2
1310 1.1 cgd */
1311 1.1 cgd
1312 1.1 cgd static int
1313 1.1 cgd strcspn(s1, s2)
1314 1.1 cgd char *s1;
1315 1.1 cgd char *s2;
1316 1.1 cgd {
1317 1.8 perry char *scan1;
1318 1.8 perry char *scan2;
1319 1.8 perry int count;
1320 1.1 cgd
1321 1.1 cgd count = 0;
1322 1.1 cgd for (scan1 = s1; *scan1 != '\0'; scan1++) {
1323 1.1 cgd for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1324 1.1 cgd if (*scan1 == *scan2++)
1325 1.1 cgd return(count);
1326 1.1 cgd count++;
1327 1.1 cgd }
1328 1.1 cgd return(count);
1329 1.1 cgd }
1330 1.1 cgd #endif
1331