regexp.c revision 1.13 1 1.1 cgd /*
2 1.1 cgd * regcomp and regexec -- regsub and regerror are elsewhere
3 1.1 cgd *
4 1.1 cgd * Copyright (c) 1986 by University of Toronto.
5 1.1 cgd * Written by Henry Spencer. Not derived from licensed software.
6 1.1 cgd *
7 1.1 cgd * Permission is granted to anyone to use this software for any
8 1.1 cgd * purpose on any computer system, and to redistribute it freely,
9 1.1 cgd * subject to the following restrictions:
10 1.1 cgd *
11 1.1 cgd * 1. The author is not responsible for the consequences of use of
12 1.1 cgd * this software, no matter how awful, even if they arise
13 1.1 cgd * from defects in it.
14 1.1 cgd *
15 1.1 cgd * 2. The origin of this software must not be misrepresented, either
16 1.1 cgd * by explicit claim or by omission.
17 1.1 cgd *
18 1.1 cgd * 3. Altered versions must be plainly marked as such, and must not
19 1.1 cgd * be misrepresented as being the original software.
20 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 1.1 cgd *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 1.1 cgd *** to assist in implementing egrep.
23 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 1.1 cgd *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 1.1 cgd *** as in BSD grep and ex.
26 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 1.1 cgd *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 1.1 cgd *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
30 1.1 cgd *
31 1.1 cgd * Beware that some of this code is subtly aware of the way operator
32 1.1 cgd * precedence is structured in regular expressions. Serious changes in
33 1.1 cgd * regular-expression syntax might require a total rethink.
34 1.1 cgd */
35 1.2 mycroft
36 1.7 lukem #include <sys/cdefs.h>
37 1.2 mycroft #ifndef lint
38 1.13 itohy __RCSID("$NetBSD: regexp.c,v 1.13 2000/07/11 06:07:25 itohy Exp $");
39 1.2 mycroft #endif /* not lint */
40 1.2 mycroft
41 1.12 lukem #include <ctype.h>
42 1.1 cgd #include <regexp.h>
43 1.1 cgd #include <stdio.h>
44 1.1 cgd #include <stdlib.h>
45 1.1 cgd #include <string.h>
46 1.1 cgd #include "regmagic.h"
47 1.1 cgd
48 1.1 cgd /*
49 1.1 cgd * The "internal use only" fields in regexp.h are present to pass info from
50 1.1 cgd * compile to execute that permits the execute phase to run lots faster on
51 1.1 cgd * simple cases. They are:
52 1.1 cgd *
53 1.1 cgd * regstart char that must begin a match; '\0' if none obvious
54 1.1 cgd * reganch is the match anchored (at beginning-of-line only)?
55 1.1 cgd * regmust string (pointer into program) that match must include, or NULL
56 1.1 cgd * regmlen length of regmust string
57 1.1 cgd *
58 1.1 cgd * Regstart and reganch permit very fast decisions on suitable starting points
59 1.1 cgd * for a match, cutting down the work a lot. Regmust permits fast rejection
60 1.1 cgd * of lines that cannot possibly match. The regmust tests are costly enough
61 1.1 cgd * that regcomp() supplies a regmust only if the r.e. contains something
62 1.1 cgd * potentially expensive (at present, the only such thing detected is * or +
63 1.1 cgd * at the start of the r.e., which can involve a lot of backup). Regmlen is
64 1.1 cgd * supplied because the test in regexec() needs it and regcomp() is computing
65 1.1 cgd * it anyway.
66 1.1 cgd */
67 1.1 cgd
68 1.1 cgd /*
69 1.1 cgd * Structure for regexp "program". This is essentially a linear encoding
70 1.1 cgd * of a nondeterministic finite-state machine (aka syntax charts or
71 1.1 cgd * "railroad normal form" in parsing technology). Each node is an opcode
72 1.1 cgd * plus a "next" pointer, possibly plus an operand. "Next" pointers of
73 1.1 cgd * all nodes except BRANCH implement concatenation; a "next" pointer with
74 1.1 cgd * a BRANCH on both ends of it is connecting two alternatives. (Here we
75 1.1 cgd * have one of the subtle syntax dependencies: an individual BRANCH (as
76 1.1 cgd * opposed to a collection of them) is never concatenated with anything
77 1.1 cgd * because of operator precedence.) The operand of some types of node is
78 1.1 cgd * a literal string; for others, it is a node leading into a sub-FSM. In
79 1.1 cgd * particular, the operand of a BRANCH node is the first node of the branch.
80 1.1 cgd * (NB this is *not* a tree structure: the tail of the branch connects
81 1.1 cgd * to the thing following the set of BRANCHes.) The opcodes are:
82 1.1 cgd */
83 1.1 cgd
84 1.1 cgd /* definition number opnd? meaning */
85 1.1 cgd #define END 0 /* no End of program. */
86 1.1 cgd #define BOL 1 /* no Match "" at beginning of line. */
87 1.1 cgd #define EOL 2 /* no Match "" at end of line. */
88 1.1 cgd #define ANY 3 /* no Match any one character. */
89 1.1 cgd #define ANYOF 4 /* str Match any character in this string. */
90 1.1 cgd #define ANYBUT 5 /* str Match any character not in this string. */
91 1.1 cgd #define BRANCH 6 /* node Match this alternative, or the next... */
92 1.1 cgd #define BACK 7 /* no Match "", "next" ptr points backward. */
93 1.1 cgd #define EXACTLY 8 /* str Match this string. */
94 1.1 cgd #define NOTHING 9 /* no Match empty string. */
95 1.1 cgd #define STAR 10 /* node Match this (simple) thing 0 or more times. */
96 1.1 cgd #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
97 1.1 cgd #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */
98 1.1 cgd #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */
99 1.1 cgd #define OPEN 20 /* no Mark this point in input as start of #n. */
100 1.1 cgd /* OPEN+1 is number 1, etc. */
101 1.1 cgd #define CLOSE 30 /* no Analogous to OPEN. */
102 1.1 cgd
103 1.1 cgd /*
104 1.1 cgd * Opcode notes:
105 1.1 cgd *
106 1.1 cgd * BRANCH The set of branches constituting a single choice are hooked
107 1.1 cgd * together with their "next" pointers, since precedence prevents
108 1.1 cgd * anything being concatenated to any individual branch. The
109 1.1 cgd * "next" pointer of the last BRANCH in a choice points to the
110 1.1 cgd * thing following the whole choice. This is also where the
111 1.1 cgd * final "next" pointer of each individual branch points; each
112 1.1 cgd * branch starts with the operand node of a BRANCH node.
113 1.1 cgd *
114 1.1 cgd * BACK Normal "next" pointers all implicitly point forward; BACK
115 1.1 cgd * exists to make loop structures possible.
116 1.1 cgd *
117 1.1 cgd * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
118 1.1 cgd * BRANCH structures using BACK. Simple cases (one character
119 1.1 cgd * per match) are implemented with STAR and PLUS for speed
120 1.1 cgd * and to minimize recursive plunges.
121 1.1 cgd *
122 1.1 cgd * OPEN,CLOSE ...are numbered at compile time.
123 1.1 cgd */
124 1.1 cgd
125 1.1 cgd /*
126 1.1 cgd * A node is one char of opcode followed by two chars of "next" pointer.
127 1.1 cgd * "Next" pointers are stored as two 8-bit pieces, high order first. The
128 1.1 cgd * value is a positive offset from the opcode of the node containing it.
129 1.1 cgd * An operand, if any, simply follows the node. (Note that much of the
130 1.1 cgd * code generation knows about this implicit relationship.)
131 1.1 cgd *
132 1.1 cgd * Using two bytes for the "next" pointer is vast overkill for most things,
133 1.1 cgd * but allows patterns to get big without disasters.
134 1.1 cgd */
135 1.1 cgd #define OP(p) (*(p))
136 1.1 cgd #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
137 1.1 cgd #define OPERAND(p) ((p) + 3)
138 1.1 cgd
139 1.1 cgd /*
140 1.1 cgd * See regmagic.h for one further detail of program structure.
141 1.1 cgd */
142 1.1 cgd
143 1.1 cgd
144 1.1 cgd /*
145 1.1 cgd * Utility definitions.
146 1.1 cgd */
147 1.1 cgd #ifndef CHARBITS
148 1.1 cgd #define UCHARAT(p) ((int)*(unsigned char *)(p))
149 1.1 cgd #else
150 1.1 cgd #define UCHARAT(p) ((int)*(p)&CHARBITS)
151 1.1 cgd #endif
152 1.1 cgd
153 1.1 cgd #define FAIL(m) { regerror(m); return(NULL); }
154 1.1 cgd #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
155 1.1 cgd
156 1.1 cgd /*
157 1.1 cgd * Flags to be passed up and down.
158 1.1 cgd */
159 1.1 cgd #define HASWIDTH 01 /* Known never to match null string. */
160 1.1 cgd #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
161 1.1 cgd #define SPSTART 04 /* Starts with * or +. */
162 1.1 cgd #define WORST 0 /* Worst case. */
163 1.1 cgd
164 1.1 cgd /*
165 1.1 cgd * Global work variables for regcomp().
166 1.1 cgd */
167 1.1 cgd static char *regparse; /* Input-scan pointer. */
168 1.1 cgd static int regnpar; /* () count. */
169 1.1 cgd static char regdummy;
170 1.1 cgd static char *regcode; /* Code-emit pointer; ®dummy = don't. */
171 1.1 cgd static long regsize; /* Code size. */
172 1.1 cgd
173 1.1 cgd /*
174 1.1 cgd * Forward declarations for regcomp()'s friends.
175 1.1 cgd */
176 1.1 cgd #ifndef STATIC
177 1.1 cgd #define STATIC static
178 1.1 cgd #endif
179 1.4 pk STATIC char *reg __P((int, int *));
180 1.4 pk STATIC char *regbranch __P((int *));
181 1.4 pk STATIC char *regpiece __P((int *));
182 1.4 pk STATIC char *regatom __P((int *));
183 1.10 christos STATIC char *regnode __P((int));
184 1.4 pk STATIC char *regnext __P((char *));
185 1.10 christos STATIC void regc __P((int));
186 1.10 christos STATIC void reginsert __P((int, char *));
187 1.4 pk STATIC void regtail __P((char *, char *));
188 1.4 pk STATIC void regoptail __P((char *, char *));
189 1.1 cgd #ifdef STRCSPN
190 1.4 pk STATIC int strcspn __P((char *, char *));
191 1.1 cgd #endif
192 1.1 cgd
193 1.1 cgd /*
194 1.1 cgd - regcomp - compile a regular expression into internal code
195 1.1 cgd *
196 1.1 cgd * We can't allocate space until we know how big the compiled form will be,
197 1.1 cgd * but we can't compile it (and thus know how big it is) until we've got a
198 1.1 cgd * place to put the code. So we cheat: we compile it twice, once with code
199 1.1 cgd * generation turned off and size counting turned on, and once "for real".
200 1.1 cgd * This also means that we don't allocate space until we are sure that the
201 1.1 cgd * thing really will compile successfully, and we never have to move the
202 1.1 cgd * code and thus invalidate pointers into it. (Note that it has to be in
203 1.1 cgd * one piece because free() must be able to free it all.)
204 1.1 cgd *
205 1.1 cgd * Beware that the optimization-preparation code in here knows about some
206 1.1 cgd * of the structure of the compiled regexp.
207 1.1 cgd */
208 1.1 cgd regexp *
209 1.9 tv __compat_regcomp(exp)
210 1.1 cgd const char *exp;
211 1.1 cgd {
212 1.8 perry regexp *r;
213 1.8 perry char *scan;
214 1.8 perry char *longest;
215 1.8 perry int len;
216 1.1 cgd int flags;
217 1.1 cgd
218 1.1 cgd if (exp == NULL)
219 1.1 cgd FAIL("NULL argument");
220 1.1 cgd
221 1.1 cgd /* First pass: determine size, legality. */
222 1.1 cgd #ifdef notdef
223 1.1 cgd if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */
224 1.1 cgd #endif
225 1.10 christos /* LINTED const castaway */
226 1.1 cgd regparse = (char *)exp;
227 1.1 cgd regnpar = 1;
228 1.1 cgd regsize = 0L;
229 1.1 cgd regcode = ®dummy;
230 1.1 cgd regc(MAGIC);
231 1.1 cgd if (reg(0, &flags) == NULL)
232 1.1 cgd return(NULL);
233 1.1 cgd
234 1.1 cgd /* Small enough for pointer-storage convention? */
235 1.1 cgd if (regsize >= 32767L) /* Probably could be 65535L. */
236 1.1 cgd FAIL("regexp too big");
237 1.1 cgd
238 1.1 cgd /* Allocate space. */
239 1.1 cgd r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
240 1.1 cgd if (r == NULL)
241 1.1 cgd FAIL("out of space");
242 1.1 cgd
243 1.1 cgd /* Second pass: emit code. */
244 1.10 christos /* LINTED const castaway */
245 1.1 cgd regparse = (char *)exp;
246 1.1 cgd regnpar = 1;
247 1.1 cgd regcode = r->program;
248 1.1 cgd regc(MAGIC);
249 1.1 cgd if (reg(0, &flags) == NULL)
250 1.1 cgd return(NULL);
251 1.1 cgd
252 1.1 cgd /* Dig out information for optimizations. */
253 1.1 cgd r->regstart = '\0'; /* Worst-case defaults. */
254 1.1 cgd r->reganch = 0;
255 1.1 cgd r->regmust = NULL;
256 1.1 cgd r->regmlen = 0;
257 1.1 cgd scan = r->program+1; /* First BRANCH. */
258 1.1 cgd if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
259 1.1 cgd scan = OPERAND(scan);
260 1.1 cgd
261 1.1 cgd /* Starting-point info. */
262 1.1 cgd if (OP(scan) == EXACTLY)
263 1.1 cgd r->regstart = *OPERAND(scan);
264 1.1 cgd else if (OP(scan) == BOL)
265 1.1 cgd r->reganch++;
266 1.1 cgd
267 1.1 cgd /*
268 1.1 cgd * If there's something expensive in the r.e., find the
269 1.1 cgd * longest literal string that must appear and make it the
270 1.1 cgd * regmust. Resolve ties in favor of later strings, since
271 1.1 cgd * the regstart check works with the beginning of the r.e.
272 1.1 cgd * and avoiding duplication strengthens checking. Not a
273 1.1 cgd * strong reason, but sufficient in the absence of others.
274 1.1 cgd */
275 1.1 cgd if (flags&SPSTART) {
276 1.1 cgd longest = NULL;
277 1.1 cgd len = 0;
278 1.1 cgd for (; scan != NULL; scan = regnext(scan))
279 1.1 cgd if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
280 1.1 cgd longest = OPERAND(scan);
281 1.1 cgd len = strlen(OPERAND(scan));
282 1.1 cgd }
283 1.1 cgd r->regmust = longest;
284 1.1 cgd r->regmlen = len;
285 1.1 cgd }
286 1.1 cgd }
287 1.1 cgd
288 1.1 cgd return(r);
289 1.1 cgd }
290 1.1 cgd
291 1.1 cgd /*
292 1.1 cgd - reg - regular expression, i.e. main body or parenthesized thing
293 1.1 cgd *
294 1.1 cgd * Caller must absorb opening parenthesis.
295 1.1 cgd *
296 1.1 cgd * Combining parenthesis handling with the base level of regular expression
297 1.1 cgd * is a trifle forced, but the need to tie the tails of the branches to what
298 1.1 cgd * follows makes it hard to avoid.
299 1.1 cgd */
300 1.1 cgd static char *
301 1.1 cgd reg(paren, flagp)
302 1.1 cgd int paren; /* Parenthesized? */
303 1.1 cgd int *flagp;
304 1.1 cgd {
305 1.8 perry char *ret;
306 1.8 perry char *br;
307 1.8 perry char *ender;
308 1.8 perry int parno = 0;
309 1.1 cgd int flags;
310 1.1 cgd
311 1.1 cgd *flagp = HASWIDTH; /* Tentatively. */
312 1.1 cgd
313 1.1 cgd /* Make an OPEN node, if parenthesized. */
314 1.1 cgd if (paren) {
315 1.1 cgd if (regnpar >= NSUBEXP)
316 1.1 cgd FAIL("too many ()");
317 1.1 cgd parno = regnpar;
318 1.1 cgd regnpar++;
319 1.1 cgd ret = regnode(OPEN+parno);
320 1.1 cgd } else
321 1.1 cgd ret = NULL;
322 1.1 cgd
323 1.1 cgd /* Pick up the branches, linking them together. */
324 1.1 cgd br = regbranch(&flags);
325 1.1 cgd if (br == NULL)
326 1.1 cgd return(NULL);
327 1.1 cgd if (ret != NULL)
328 1.1 cgd regtail(ret, br); /* OPEN -> first. */
329 1.1 cgd else
330 1.1 cgd ret = br;
331 1.1 cgd if (!(flags&HASWIDTH))
332 1.1 cgd *flagp &= ~HASWIDTH;
333 1.1 cgd *flagp |= flags&SPSTART;
334 1.1 cgd while (*regparse == '|' || *regparse == '\n') {
335 1.1 cgd regparse++;
336 1.1 cgd br = regbranch(&flags);
337 1.1 cgd if (br == NULL)
338 1.1 cgd return(NULL);
339 1.1 cgd regtail(ret, br); /* BRANCH -> BRANCH. */
340 1.1 cgd if (!(flags&HASWIDTH))
341 1.1 cgd *flagp &= ~HASWIDTH;
342 1.1 cgd *flagp |= flags&SPSTART;
343 1.1 cgd }
344 1.1 cgd
345 1.1 cgd /* Make a closing node, and hook it on the end. */
346 1.11 simonb ender = regnode((paren) ? CLOSE+parno : END);
347 1.1 cgd regtail(ret, ender);
348 1.1 cgd
349 1.1 cgd /* Hook the tails of the branches to the closing node. */
350 1.1 cgd for (br = ret; br != NULL; br = regnext(br))
351 1.1 cgd regoptail(br, ender);
352 1.1 cgd
353 1.1 cgd /* Check for proper termination. */
354 1.1 cgd if (paren && *regparse++ != ')') {
355 1.1 cgd FAIL("unmatched ()");
356 1.1 cgd } else if (!paren && *regparse != '\0') {
357 1.1 cgd if (*regparse == ')') {
358 1.1 cgd FAIL("unmatched ()");
359 1.1 cgd } else
360 1.1 cgd FAIL("junk on end"); /* "Can't happen". */
361 1.1 cgd /* NOTREACHED */
362 1.1 cgd }
363 1.1 cgd
364 1.1 cgd return(ret);
365 1.1 cgd }
366 1.1 cgd
367 1.1 cgd /*
368 1.1 cgd - regbranch - one alternative of an | operator
369 1.1 cgd *
370 1.1 cgd * Implements the concatenation operator.
371 1.1 cgd */
372 1.1 cgd static char *
373 1.1 cgd regbranch(flagp)
374 1.1 cgd int *flagp;
375 1.1 cgd {
376 1.8 perry char *ret;
377 1.8 perry char *chain;
378 1.8 perry char *latest;
379 1.1 cgd int flags;
380 1.1 cgd
381 1.1 cgd *flagp = WORST; /* Tentatively. */
382 1.1 cgd
383 1.1 cgd ret = regnode(BRANCH);
384 1.1 cgd chain = NULL;
385 1.1 cgd while (*regparse != '\0' && *regparse != ')' &&
386 1.1 cgd *regparse != '\n' && *regparse != '|') {
387 1.1 cgd latest = regpiece(&flags);
388 1.1 cgd if (latest == NULL)
389 1.1 cgd return(NULL);
390 1.1 cgd *flagp |= flags&HASWIDTH;
391 1.1 cgd if (chain == NULL) /* First piece. */
392 1.1 cgd *flagp |= flags&SPSTART;
393 1.1 cgd else
394 1.1 cgd regtail(chain, latest);
395 1.1 cgd chain = latest;
396 1.1 cgd }
397 1.1 cgd if (chain == NULL) /* Loop ran zero times. */
398 1.1 cgd (void) regnode(NOTHING);
399 1.1 cgd
400 1.1 cgd return(ret);
401 1.1 cgd }
402 1.1 cgd
403 1.1 cgd /*
404 1.1 cgd - regpiece - something followed by possible [*+?]
405 1.1 cgd *
406 1.1 cgd * Note that the branching code sequences used for ? and the general cases
407 1.1 cgd * of * and + are somewhat optimized: they use the same NOTHING node as
408 1.1 cgd * both the endmarker for their branch list and the body of the last branch.
409 1.1 cgd * It might seem that this node could be dispensed with entirely, but the
410 1.1 cgd * endmarker role is not redundant.
411 1.1 cgd */
412 1.1 cgd static char *
413 1.1 cgd regpiece(flagp)
414 1.1 cgd int *flagp;
415 1.1 cgd {
416 1.8 perry char *ret;
417 1.8 perry char op;
418 1.8 perry char *next;
419 1.1 cgd int flags;
420 1.1 cgd
421 1.1 cgd ret = regatom(&flags);
422 1.1 cgd if (ret == NULL)
423 1.1 cgd return(NULL);
424 1.1 cgd
425 1.1 cgd op = *regparse;
426 1.1 cgd if (!ISMULT(op)) {
427 1.1 cgd *flagp = flags;
428 1.1 cgd return(ret);
429 1.1 cgd }
430 1.1 cgd
431 1.1 cgd if (!(flags&HASWIDTH) && op != '?')
432 1.1 cgd FAIL("*+ operand could be empty");
433 1.1 cgd *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
434 1.1 cgd
435 1.1 cgd if (op == '*' && (flags&SIMPLE))
436 1.1 cgd reginsert(STAR, ret);
437 1.1 cgd else if (op == '*') {
438 1.1 cgd /* Emit x* as (x&|), where & means "self". */
439 1.1 cgd reginsert(BRANCH, ret); /* Either x */
440 1.1 cgd regoptail(ret, regnode(BACK)); /* and loop */
441 1.1 cgd regoptail(ret, ret); /* back */
442 1.1 cgd regtail(ret, regnode(BRANCH)); /* or */
443 1.1 cgd regtail(ret, regnode(NOTHING)); /* null. */
444 1.1 cgd } else if (op == '+' && (flags&SIMPLE))
445 1.1 cgd reginsert(PLUS, ret);
446 1.1 cgd else if (op == '+') {
447 1.1 cgd /* Emit x+ as x(&|), where & means "self". */
448 1.1 cgd next = regnode(BRANCH); /* Either */
449 1.1 cgd regtail(ret, next);
450 1.1 cgd regtail(regnode(BACK), ret); /* loop back */
451 1.1 cgd regtail(next, regnode(BRANCH)); /* or */
452 1.1 cgd regtail(ret, regnode(NOTHING)); /* null. */
453 1.1 cgd } else if (op == '?') {
454 1.1 cgd /* Emit x? as (x|) */
455 1.1 cgd reginsert(BRANCH, ret); /* Either x */
456 1.1 cgd regtail(ret, regnode(BRANCH)); /* or */
457 1.1 cgd next = regnode(NOTHING); /* null. */
458 1.1 cgd regtail(ret, next);
459 1.1 cgd regoptail(ret, next);
460 1.1 cgd }
461 1.1 cgd regparse++;
462 1.1 cgd if (ISMULT(*regparse))
463 1.1 cgd FAIL("nested *?+");
464 1.1 cgd
465 1.1 cgd return(ret);
466 1.1 cgd }
467 1.1 cgd
468 1.1 cgd /*
469 1.1 cgd - regatom - the lowest level
470 1.1 cgd *
471 1.1 cgd * Optimization: gobbles an entire sequence of ordinary characters so that
472 1.1 cgd * it can turn them into a single node, which is smaller to store and
473 1.1 cgd * faster to run. Backslashed characters are exceptions, each becoming a
474 1.1 cgd * separate node; the code is simpler that way and it's not worth fixing.
475 1.1 cgd */
476 1.1 cgd static char *
477 1.1 cgd regatom(flagp)
478 1.1 cgd int *flagp;
479 1.1 cgd {
480 1.8 perry char *ret;
481 1.1 cgd int flags;
482 1.1 cgd
483 1.1 cgd *flagp = WORST; /* Tentatively. */
484 1.1 cgd
485 1.1 cgd switch (*regparse++) {
486 1.1 cgd /* FIXME: these chars only have meaning at beg/end of pat? */
487 1.1 cgd case '^':
488 1.1 cgd ret = regnode(BOL);
489 1.1 cgd break;
490 1.1 cgd case '$':
491 1.1 cgd ret = regnode(EOL);
492 1.1 cgd break;
493 1.1 cgd case '.':
494 1.1 cgd ret = regnode(ANY);
495 1.1 cgd *flagp |= HASWIDTH|SIMPLE;
496 1.1 cgd break;
497 1.1 cgd case '[': {
498 1.8 perry int class;
499 1.8 perry int classend;
500 1.1 cgd
501 1.1 cgd if (*regparse == '^') { /* Complement of range. */
502 1.1 cgd ret = regnode(ANYBUT);
503 1.1 cgd regparse++;
504 1.1 cgd } else
505 1.1 cgd ret = regnode(ANYOF);
506 1.1 cgd if (*regparse == ']' || *regparse == '-')
507 1.1 cgd regc(*regparse++);
508 1.1 cgd while (*regparse != '\0' && *regparse != ']') {
509 1.1 cgd if (*regparse == '-') {
510 1.1 cgd regparse++;
511 1.1 cgd if (*regparse == ']' || *regparse == '\0')
512 1.1 cgd regc('-');
513 1.1 cgd else {
514 1.1 cgd class = UCHARAT(regparse-2)+1;
515 1.1 cgd classend = UCHARAT(regparse);
516 1.1 cgd if (class > classend+1)
517 1.1 cgd FAIL("invalid [] range");
518 1.1 cgd for (; class <= classend; class++)
519 1.1 cgd regc(class);
520 1.1 cgd regparse++;
521 1.1 cgd }
522 1.1 cgd } else
523 1.1 cgd regc(*regparse++);
524 1.1 cgd }
525 1.1 cgd regc('\0');
526 1.1 cgd if (*regparse != ']')
527 1.1 cgd FAIL("unmatched []");
528 1.1 cgd regparse++;
529 1.1 cgd *flagp |= HASWIDTH|SIMPLE;
530 1.1 cgd }
531 1.1 cgd break;
532 1.1 cgd case '(':
533 1.1 cgd ret = reg(1, &flags);
534 1.1 cgd if (ret == NULL)
535 1.1 cgd return(NULL);
536 1.1 cgd *flagp |= flags&(HASWIDTH|SPSTART);
537 1.1 cgd break;
538 1.1 cgd case '\0':
539 1.1 cgd case '|':
540 1.1 cgd case '\n':
541 1.1 cgd case ')':
542 1.1 cgd FAIL("internal urp"); /* Supposed to be caught earlier. */
543 1.1 cgd case '?':
544 1.1 cgd case '+':
545 1.1 cgd case '*':
546 1.1 cgd FAIL("?+* follows nothing");
547 1.1 cgd case '\\':
548 1.1 cgd switch (*regparse++) {
549 1.1 cgd case '\0':
550 1.1 cgd FAIL("trailing \\");
551 1.1 cgd case '<':
552 1.1 cgd ret = regnode(WORDA);
553 1.1 cgd break;
554 1.1 cgd case '>':
555 1.1 cgd ret = regnode(WORDZ);
556 1.1 cgd break;
557 1.1 cgd /* FIXME: Someday handle \1, \2, ... */
558 1.1 cgd default:
559 1.1 cgd /* Handle general quoted chars in exact-match routine */
560 1.1 cgd goto de_fault;
561 1.1 cgd }
562 1.1 cgd break;
563 1.1 cgd de_fault:
564 1.10 christos /*FALLTHROUGH*/
565 1.1 cgd default:
566 1.1 cgd /*
567 1.1 cgd * Encode a string of characters to be matched exactly.
568 1.1 cgd *
569 1.1 cgd * This is a bit tricky due to quoted chars and due to
570 1.1 cgd * '*', '+', and '?' taking the SINGLE char previous
571 1.1 cgd * as their operand.
572 1.1 cgd *
573 1.1 cgd * On entry, the char at regparse[-1] is going to go
574 1.1 cgd * into the string, no matter what it is. (It could be
575 1.1 cgd * following a \ if we are entered from the '\' case.)
576 1.11 simonb *
577 1.1 cgd * Basic idea is to pick up a good char in ch and
578 1.1 cgd * examine the next char. If it's *+? then we twiddle.
579 1.1 cgd * If it's \ then we frozzle. If it's other magic char
580 1.1 cgd * we push ch and terminate the string. If none of the
581 1.1 cgd * above, we push ch on the string and go around again.
582 1.1 cgd *
583 1.1 cgd * regprev is used to remember where "the current char"
584 1.1 cgd * starts in the string, if due to a *+? we need to back
585 1.1 cgd * up and put the current char in a separate, 1-char, string.
586 1.1 cgd * When regprev is NULL, ch is the only char in the
587 1.1 cgd * string; this is used in *+? handling, and in setting
588 1.1 cgd * flags |= SIMPLE at the end.
589 1.1 cgd */
590 1.1 cgd {
591 1.1 cgd char *regprev;
592 1.8 perry char ch;
593 1.1 cgd
594 1.1 cgd regparse--; /* Look at cur char */
595 1.1 cgd ret = regnode(EXACTLY);
596 1.1 cgd for ( regprev = 0 ; ; ) {
597 1.1 cgd ch = *regparse++; /* Get current char */
598 1.1 cgd switch (*regparse) { /* look at next one */
599 1.1 cgd
600 1.1 cgd default:
601 1.1 cgd regc(ch); /* Add cur to string */
602 1.1 cgd break;
603 1.1 cgd
604 1.1 cgd case '.': case '[': case '(':
605 1.1 cgd case ')': case '|': case '\n':
606 1.1 cgd case '$': case '^':
607 1.1 cgd case '\0':
608 1.1 cgd /* FIXME, $ and ^ should not always be magic */
609 1.1 cgd magic:
610 1.1 cgd regc(ch); /* dump cur char */
611 1.1 cgd goto done; /* and we are done */
612 1.1 cgd
613 1.1 cgd case '?': case '+': case '*':
614 1.1 cgd if (!regprev) /* If just ch in str, */
615 1.1 cgd goto magic; /* use it */
616 1.1 cgd /* End mult-char string one early */
617 1.1 cgd regparse = regprev; /* Back up parse */
618 1.1 cgd goto done;
619 1.1 cgd
620 1.1 cgd case '\\':
621 1.1 cgd regc(ch); /* Cur char OK */
622 1.1 cgd switch (regparse[1]){ /* Look after \ */
623 1.1 cgd case '\0':
624 1.1 cgd case '<':
625 1.1 cgd case '>':
626 1.1 cgd /* FIXME: Someday handle \1, \2, ... */
627 1.1 cgd goto done; /* Not quoted */
628 1.1 cgd default:
629 1.1 cgd /* Backup point is \, scan * point is after it. */
630 1.1 cgd regprev = regparse;
631 1.11 simonb regparse++;
632 1.1 cgd continue; /* NOT break; */
633 1.1 cgd }
634 1.1 cgd }
635 1.1 cgd regprev = regparse; /* Set backup point */
636 1.1 cgd }
637 1.1 cgd done:
638 1.1 cgd regc('\0');
639 1.1 cgd *flagp |= HASWIDTH;
640 1.1 cgd if (!regprev) /* One char? */
641 1.1 cgd *flagp |= SIMPLE;
642 1.1 cgd }
643 1.1 cgd break;
644 1.1 cgd }
645 1.1 cgd
646 1.1 cgd return(ret);
647 1.1 cgd }
648 1.1 cgd
649 1.1 cgd /*
650 1.1 cgd - regnode - emit a node
651 1.1 cgd */
652 1.1 cgd static char * /* Location. */
653 1.1 cgd regnode(op)
654 1.10 christos int op;
655 1.1 cgd {
656 1.8 perry char *ret;
657 1.8 perry char *ptr;
658 1.1 cgd
659 1.1 cgd ret = regcode;
660 1.1 cgd if (ret == ®dummy) {
661 1.1 cgd regsize += 3;
662 1.1 cgd return(ret);
663 1.1 cgd }
664 1.1 cgd
665 1.1 cgd ptr = ret;
666 1.1 cgd *ptr++ = op;
667 1.1 cgd *ptr++ = '\0'; /* Null "next" pointer. */
668 1.1 cgd *ptr++ = '\0';
669 1.1 cgd regcode = ptr;
670 1.1 cgd
671 1.1 cgd return(ret);
672 1.1 cgd }
673 1.1 cgd
674 1.1 cgd /*
675 1.1 cgd - regc - emit (if appropriate) a byte of code
676 1.1 cgd */
677 1.1 cgd static void
678 1.1 cgd regc(b)
679 1.10 christos int b;
680 1.1 cgd {
681 1.1 cgd if (regcode != ®dummy)
682 1.1 cgd *regcode++ = b;
683 1.1 cgd else
684 1.1 cgd regsize++;
685 1.1 cgd }
686 1.1 cgd
687 1.1 cgd /*
688 1.1 cgd - reginsert - insert an operator in front of already-emitted operand
689 1.1 cgd *
690 1.1 cgd * Means relocating the operand.
691 1.1 cgd */
692 1.1 cgd static void
693 1.1 cgd reginsert(op, opnd)
694 1.10 christos int op;
695 1.1 cgd char *opnd;
696 1.1 cgd {
697 1.8 perry char *src;
698 1.8 perry char *dst;
699 1.8 perry char *place;
700 1.1 cgd
701 1.1 cgd if (regcode == ®dummy) {
702 1.1 cgd regsize += 3;
703 1.1 cgd return;
704 1.1 cgd }
705 1.1 cgd
706 1.1 cgd src = regcode;
707 1.1 cgd regcode += 3;
708 1.1 cgd dst = regcode;
709 1.1 cgd while (src > opnd)
710 1.1 cgd *--dst = *--src;
711 1.1 cgd
712 1.1 cgd place = opnd; /* Op node, where operand used to be. */
713 1.1 cgd *place++ = op;
714 1.1 cgd *place++ = '\0';
715 1.1 cgd *place++ = '\0';
716 1.1 cgd }
717 1.1 cgd
718 1.1 cgd /*
719 1.1 cgd - regtail - set the next-pointer at the end of a node chain
720 1.1 cgd */
721 1.1 cgd static void
722 1.1 cgd regtail(p, val)
723 1.1 cgd char *p;
724 1.1 cgd char *val;
725 1.1 cgd {
726 1.8 perry char *scan;
727 1.8 perry char *temp;
728 1.8 perry int offset;
729 1.1 cgd
730 1.1 cgd if (p == ®dummy)
731 1.1 cgd return;
732 1.1 cgd
733 1.1 cgd /* Find last node. */
734 1.1 cgd scan = p;
735 1.1 cgd for (;;) {
736 1.1 cgd temp = regnext(scan);
737 1.1 cgd if (temp == NULL)
738 1.1 cgd break;
739 1.1 cgd scan = temp;
740 1.1 cgd }
741 1.1 cgd
742 1.1 cgd if (OP(scan) == BACK)
743 1.1 cgd offset = scan - val;
744 1.1 cgd else
745 1.1 cgd offset = val - scan;
746 1.10 christos *(scan+1) = ((unsigned int)offset>>8)&0377;
747 1.1 cgd *(scan+2) = offset&0377;
748 1.1 cgd }
749 1.1 cgd
750 1.1 cgd /*
751 1.1 cgd - regoptail - regtail on operand of first argument; nop if operandless
752 1.1 cgd */
753 1.1 cgd static void
754 1.1 cgd regoptail(p, val)
755 1.1 cgd char *p;
756 1.1 cgd char *val;
757 1.1 cgd {
758 1.1 cgd /* "Operandless" and "op != BRANCH" are synonymous in practice. */
759 1.1 cgd if (p == NULL || p == ®dummy || OP(p) != BRANCH)
760 1.1 cgd return;
761 1.1 cgd regtail(OPERAND(p), val);
762 1.1 cgd }
763 1.1 cgd
764 1.1 cgd /*
765 1.1 cgd * regexec and friends
766 1.1 cgd */
767 1.1 cgd
768 1.1 cgd /*
769 1.1 cgd * Global work variables for regexec().
770 1.1 cgd */
771 1.1 cgd static char *reginput; /* String-input pointer. */
772 1.1 cgd static char *regbol; /* Beginning of input, for ^ check. */
773 1.1 cgd static char **regstartp; /* Pointer to startp array. */
774 1.1 cgd static char **regendp; /* Ditto for endp. */
775 1.1 cgd
776 1.1 cgd /*
777 1.1 cgd * Forwards.
778 1.1 cgd */
779 1.4 pk STATIC int regtry __P((const regexp *, const char *));
780 1.4 pk STATIC int regmatch __P((char *));
781 1.4 pk STATIC int regrepeat __P((char *));
782 1.1 cgd
783 1.1 cgd #ifdef DEBUG
784 1.1 cgd int regnarrate = 0;
785 1.4 pk void regdump __P((regexp *));
786 1.4 pk STATIC char *regprop __P((char *));
787 1.1 cgd #endif
788 1.1 cgd
789 1.1 cgd /*
790 1.1 cgd - regexec - match a regexp against a string
791 1.1 cgd */
792 1.1 cgd int
793 1.9 tv __compat_regexec(prog, string)
794 1.8 perry const regexp *prog;
795 1.8 perry const char *string;
796 1.1 cgd {
797 1.8 perry char *s;
798 1.1 cgd
799 1.1 cgd /* Be paranoid... */
800 1.1 cgd if (prog == NULL || string == NULL) {
801 1.1 cgd regerror("NULL parameter");
802 1.1 cgd return(0);
803 1.1 cgd }
804 1.1 cgd
805 1.1 cgd /* Check validity of program. */
806 1.1 cgd if (UCHARAT(prog->program) != MAGIC) {
807 1.1 cgd regerror("corrupted program");
808 1.1 cgd return(0);
809 1.1 cgd }
810 1.1 cgd
811 1.1 cgd /* If there is a "must appear" string, look for it. */
812 1.1 cgd if (prog->regmust != NULL) {
813 1.10 christos /* LINTED const castaway */
814 1.1 cgd s = (char *)string;
815 1.1 cgd while ((s = strchr(s, prog->regmust[0])) != NULL) {
816 1.10 christos if (strncmp(s, prog->regmust,
817 1.10 christos (size_t)prog->regmlen) == 0)
818 1.1 cgd break; /* Found it. */
819 1.1 cgd s++;
820 1.1 cgd }
821 1.1 cgd if (s == NULL) /* Not present. */
822 1.1 cgd return(0);
823 1.1 cgd }
824 1.1 cgd
825 1.1 cgd /* Mark beginning of line for ^ . */
826 1.10 christos /* LINTED const castaway */
827 1.1 cgd regbol = (char *)string;
828 1.1 cgd
829 1.1 cgd /* Simplest case: anchored match need be tried only once. */
830 1.1 cgd if (prog->reganch)
831 1.1 cgd return(regtry(prog, string));
832 1.1 cgd
833 1.1 cgd /* Messy cases: unanchored match. */
834 1.10 christos /* LINTED const castaway */
835 1.1 cgd s = (char *)string;
836 1.1 cgd if (prog->regstart != '\0')
837 1.1 cgd /* We know what char it must start with. */
838 1.1 cgd while ((s = strchr(s, prog->regstart)) != NULL) {
839 1.1 cgd if (regtry(prog, s))
840 1.1 cgd return(1);
841 1.1 cgd s++;
842 1.1 cgd }
843 1.1 cgd else
844 1.1 cgd /* We don't -- general case. */
845 1.1 cgd do {
846 1.1 cgd if (regtry(prog, s))
847 1.1 cgd return(1);
848 1.1 cgd } while (*s++ != '\0');
849 1.1 cgd
850 1.1 cgd /* Failure. */
851 1.1 cgd return(0);
852 1.1 cgd }
853 1.1 cgd
854 1.1 cgd /*
855 1.1 cgd - regtry - try match at specific point
856 1.1 cgd */
857 1.1 cgd static int /* 0 failure, 1 success */
858 1.1 cgd regtry(prog, string)
859 1.4 pk const regexp *prog;
860 1.4 pk const char *string;
861 1.1 cgd {
862 1.8 perry int i;
863 1.8 perry char **sp;
864 1.8 perry char **ep;
865 1.1 cgd
866 1.10 christos /* LINTED const castaway */
867 1.5 cgd reginput = (char *)string; /* XXX */
868 1.5 cgd regstartp = (char **)prog->startp; /* XXX */
869 1.5 cgd regendp = (char **)prog->endp; /* XXX */
870 1.1 cgd
871 1.5 cgd sp = (char **)prog->startp; /* XXX */
872 1.5 cgd ep = (char **)prog->endp; /* XXX */
873 1.1 cgd for (i = NSUBEXP; i > 0; i--) {
874 1.1 cgd *sp++ = NULL;
875 1.1 cgd *ep++ = NULL;
876 1.1 cgd }
877 1.5 cgd if (regmatch((char *)prog->program + 1)) { /* XXX */
878 1.10 christos /* LINTED const castaway */
879 1.5 cgd ((regexp *)prog)->startp[0] = (char *)string; /* XXX */
880 1.10 christos /* LINTED const castaway */
881 1.5 cgd ((regexp *)prog)->endp[0] = reginput; /* XXX */
882 1.1 cgd return(1);
883 1.1 cgd } else
884 1.1 cgd return(0);
885 1.1 cgd }
886 1.1 cgd
887 1.1 cgd /*
888 1.1 cgd - regmatch - main matching routine
889 1.1 cgd *
890 1.1 cgd * Conceptually the strategy is simple: check to see whether the current
891 1.1 cgd * node matches, call self recursively to see whether the rest matches,
892 1.1 cgd * and then act accordingly. In practice we make some effort to avoid
893 1.1 cgd * recursion, in particular by going through "ordinary" nodes (that don't
894 1.1 cgd * need to know whether the rest of the match failed) by a loop instead of
895 1.1 cgd * by recursion.
896 1.1 cgd */
897 1.1 cgd static int /* 0 failure, 1 success */
898 1.1 cgd regmatch(prog)
899 1.1 cgd char *prog;
900 1.1 cgd {
901 1.8 perry char *scan; /* Current node. */
902 1.1 cgd char *next; /* Next node. */
903 1.1 cgd
904 1.1 cgd scan = prog;
905 1.1 cgd #ifdef DEBUG
906 1.1 cgd if (scan != NULL && regnarrate)
907 1.1 cgd fprintf(stderr, "%s(\n", regprop(scan));
908 1.1 cgd #endif
909 1.1 cgd while (scan != NULL) {
910 1.1 cgd #ifdef DEBUG
911 1.1 cgd if (regnarrate)
912 1.1 cgd fprintf(stderr, "%s...\n", regprop(scan));
913 1.1 cgd #endif
914 1.1 cgd next = regnext(scan);
915 1.1 cgd
916 1.1 cgd switch (OP(scan)) {
917 1.1 cgd case BOL:
918 1.1 cgd if (reginput != regbol)
919 1.1 cgd return(0);
920 1.1 cgd break;
921 1.1 cgd case EOL:
922 1.1 cgd if (*reginput != '\0')
923 1.1 cgd return(0);
924 1.1 cgd break;
925 1.1 cgd case WORDA:
926 1.1 cgd /* Must be looking at a letter, digit, or _ */
927 1.13 itohy if ((!isalnum(UCHARAT(reginput))) && *reginput != '_')
928 1.1 cgd return(0);
929 1.1 cgd /* Prev must be BOL or nonword */
930 1.1 cgd if (reginput > regbol &&
931 1.13 itohy (isalnum(UCHARAT(reginput - 1))
932 1.13 itohy || reginput[-1] == '_'))
933 1.1 cgd return(0);
934 1.1 cgd break;
935 1.1 cgd case WORDZ:
936 1.1 cgd /* Must be looking at non letter, digit, or _ */
937 1.13 itohy if (isalnum(UCHARAT(reginput)) || *reginput == '_')
938 1.1 cgd return(0);
939 1.1 cgd /* We don't care what the previous char was */
940 1.1 cgd break;
941 1.1 cgd case ANY:
942 1.1 cgd if (*reginput == '\0')
943 1.1 cgd return(0);
944 1.1 cgd reginput++;
945 1.1 cgd break;
946 1.1 cgd case EXACTLY: {
947 1.8 perry int len;
948 1.8 perry char *opnd;
949 1.1 cgd
950 1.1 cgd opnd = OPERAND(scan);
951 1.1 cgd /* Inline the first character, for speed. */
952 1.1 cgd if (*opnd != *reginput)
953 1.1 cgd return(0);
954 1.1 cgd len = strlen(opnd);
955 1.11 simonb if (len > 1 && strncmp(opnd, reginput,
956 1.10 christos (size_t)len) != 0)
957 1.1 cgd return(0);
958 1.1 cgd reginput += len;
959 1.1 cgd }
960 1.1 cgd break;
961 1.1 cgd case ANYOF:
962 1.1 cgd if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
963 1.1 cgd return(0);
964 1.1 cgd reginput++;
965 1.1 cgd break;
966 1.1 cgd case ANYBUT:
967 1.1 cgd if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
968 1.1 cgd return(0);
969 1.1 cgd reginput++;
970 1.1 cgd break;
971 1.1 cgd case NOTHING:
972 1.1 cgd break;
973 1.1 cgd case BACK:
974 1.1 cgd break;
975 1.1 cgd case OPEN+1:
976 1.1 cgd case OPEN+2:
977 1.1 cgd case OPEN+3:
978 1.1 cgd case OPEN+4:
979 1.1 cgd case OPEN+5:
980 1.1 cgd case OPEN+6:
981 1.1 cgd case OPEN+7:
982 1.1 cgd case OPEN+8:
983 1.1 cgd case OPEN+9: {
984 1.8 perry int no;
985 1.8 perry char *save;
986 1.1 cgd
987 1.1 cgd no = OP(scan) - OPEN;
988 1.1 cgd save = reginput;
989 1.1 cgd
990 1.1 cgd if (regmatch(next)) {
991 1.1 cgd /*
992 1.1 cgd * Don't set startp if some later
993 1.1 cgd * invocation of the same parentheses
994 1.1 cgd * already has.
995 1.1 cgd */
996 1.1 cgd if (regstartp[no] == NULL)
997 1.1 cgd regstartp[no] = save;
998 1.1 cgd return(1);
999 1.1 cgd } else
1000 1.1 cgd return(0);
1001 1.1 cgd }
1002 1.1 cgd case CLOSE+1:
1003 1.1 cgd case CLOSE+2:
1004 1.1 cgd case CLOSE+3:
1005 1.1 cgd case CLOSE+4:
1006 1.1 cgd case CLOSE+5:
1007 1.1 cgd case CLOSE+6:
1008 1.1 cgd case CLOSE+7:
1009 1.1 cgd case CLOSE+8:
1010 1.1 cgd case CLOSE+9: {
1011 1.8 perry int no;
1012 1.8 perry char *save;
1013 1.1 cgd
1014 1.1 cgd no = OP(scan) - CLOSE;
1015 1.1 cgd save = reginput;
1016 1.1 cgd
1017 1.1 cgd if (regmatch(next)) {
1018 1.1 cgd /*
1019 1.1 cgd * Don't set endp if some later
1020 1.1 cgd * invocation of the same parentheses
1021 1.1 cgd * already has.
1022 1.1 cgd */
1023 1.1 cgd if (regendp[no] == NULL)
1024 1.1 cgd regendp[no] = save;
1025 1.1 cgd return(1);
1026 1.1 cgd } else
1027 1.1 cgd return(0);
1028 1.1 cgd }
1029 1.1 cgd case BRANCH: {
1030 1.8 perry char *save;
1031 1.1 cgd
1032 1.1 cgd if (OP(next) != BRANCH) /* No choice. */
1033 1.1 cgd next = OPERAND(scan); /* Avoid recursion. */
1034 1.1 cgd else {
1035 1.1 cgd do {
1036 1.1 cgd save = reginput;
1037 1.1 cgd if (regmatch(OPERAND(scan)))
1038 1.1 cgd return(1);
1039 1.1 cgd reginput = save;
1040 1.1 cgd scan = regnext(scan);
1041 1.1 cgd } while (scan != NULL && OP(scan) == BRANCH);
1042 1.1 cgd return(0);
1043 1.1 cgd /* NOTREACHED */
1044 1.1 cgd }
1045 1.1 cgd }
1046 1.1 cgd break;
1047 1.1 cgd case STAR:
1048 1.1 cgd case PLUS: {
1049 1.8 perry char nextch;
1050 1.8 perry int no;
1051 1.8 perry char *save;
1052 1.8 perry int min;
1053 1.1 cgd
1054 1.1 cgd /*
1055 1.1 cgd * Lookahead to avoid useless match attempts
1056 1.1 cgd * when we know what character comes next.
1057 1.1 cgd */
1058 1.1 cgd nextch = '\0';
1059 1.1 cgd if (OP(next) == EXACTLY)
1060 1.1 cgd nextch = *OPERAND(next);
1061 1.1 cgd min = (OP(scan) == STAR) ? 0 : 1;
1062 1.1 cgd save = reginput;
1063 1.1 cgd no = regrepeat(OPERAND(scan));
1064 1.1 cgd while (no >= min) {
1065 1.1 cgd /* If it could work, try it. */
1066 1.1 cgd if (nextch == '\0' || *reginput == nextch)
1067 1.1 cgd if (regmatch(next))
1068 1.1 cgd return(1);
1069 1.1 cgd /* Couldn't or didn't -- back up. */
1070 1.1 cgd no--;
1071 1.1 cgd reginput = save + no;
1072 1.1 cgd }
1073 1.1 cgd return(0);
1074 1.1 cgd }
1075 1.1 cgd case END:
1076 1.1 cgd return(1); /* Success! */
1077 1.1 cgd default:
1078 1.1 cgd regerror("memory corruption");
1079 1.1 cgd return(0);
1080 1.1 cgd }
1081 1.1 cgd
1082 1.1 cgd scan = next;
1083 1.1 cgd }
1084 1.1 cgd
1085 1.1 cgd /*
1086 1.1 cgd * We get here only if there's trouble -- normally "case END" is
1087 1.1 cgd * the terminating point.
1088 1.1 cgd */
1089 1.1 cgd regerror("corrupted pointers");
1090 1.1 cgd return(0);
1091 1.1 cgd }
1092 1.1 cgd
1093 1.1 cgd /*
1094 1.1 cgd - regrepeat - repeatedly match something simple, report how many
1095 1.1 cgd */
1096 1.1 cgd static int
1097 1.1 cgd regrepeat(p)
1098 1.1 cgd char *p;
1099 1.1 cgd {
1100 1.8 perry int count = 0;
1101 1.8 perry char *scan;
1102 1.8 perry char *opnd;
1103 1.1 cgd
1104 1.1 cgd scan = reginput;
1105 1.1 cgd opnd = OPERAND(p);
1106 1.1 cgd switch (OP(p)) {
1107 1.1 cgd case ANY:
1108 1.1 cgd count = strlen(scan);
1109 1.1 cgd scan += count;
1110 1.1 cgd break;
1111 1.1 cgd case EXACTLY:
1112 1.1 cgd while (*opnd == *scan) {
1113 1.1 cgd count++;
1114 1.1 cgd scan++;
1115 1.1 cgd }
1116 1.1 cgd break;
1117 1.1 cgd case ANYOF:
1118 1.1 cgd while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1119 1.1 cgd count++;
1120 1.1 cgd scan++;
1121 1.1 cgd }
1122 1.1 cgd break;
1123 1.1 cgd case ANYBUT:
1124 1.1 cgd while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1125 1.1 cgd count++;
1126 1.1 cgd scan++;
1127 1.1 cgd }
1128 1.1 cgd break;
1129 1.1 cgd default: /* Oh dear. Called inappropriately. */
1130 1.1 cgd regerror("internal foulup");
1131 1.1 cgd count = 0; /* Best compromise. */
1132 1.1 cgd break;
1133 1.1 cgd }
1134 1.1 cgd reginput = scan;
1135 1.1 cgd
1136 1.1 cgd return(count);
1137 1.1 cgd }
1138 1.1 cgd
1139 1.1 cgd /*
1140 1.1 cgd - regnext - dig the "next" pointer out of a node
1141 1.1 cgd */
1142 1.1 cgd static char *
1143 1.1 cgd regnext(p)
1144 1.8 perry char *p;
1145 1.1 cgd {
1146 1.8 perry int offset;
1147 1.1 cgd
1148 1.1 cgd if (p == ®dummy)
1149 1.1 cgd return(NULL);
1150 1.1 cgd
1151 1.1 cgd offset = NEXT(p);
1152 1.1 cgd if (offset == 0)
1153 1.1 cgd return(NULL);
1154 1.1 cgd
1155 1.1 cgd if (OP(p) == BACK)
1156 1.1 cgd return(p-offset);
1157 1.1 cgd else
1158 1.1 cgd return(p+offset);
1159 1.1 cgd }
1160 1.1 cgd
1161 1.1 cgd #ifdef DEBUG
1162 1.1 cgd
1163 1.1 cgd /*
1164 1.1 cgd - regdump - dump a regexp onto stdout in vaguely comprehensible form
1165 1.1 cgd */
1166 1.1 cgd void
1167 1.1 cgd regdump(r)
1168 1.1 cgd regexp *r;
1169 1.1 cgd {
1170 1.8 perry char *s;
1171 1.8 perry char op = EXACTLY; /* Arbitrary non-END op. */
1172 1.8 perry char *next;
1173 1.1 cgd extern char *strchr();
1174 1.1 cgd
1175 1.1 cgd
1176 1.1 cgd s = r->program + 1;
1177 1.1 cgd while (op != END) { /* While that wasn't END last time... */
1178 1.1 cgd op = OP(s);
1179 1.1 cgd printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1180 1.1 cgd next = regnext(s);
1181 1.1 cgd if (next == NULL) /* Next ptr. */
1182 1.1 cgd printf("(0)");
1183 1.11 simonb else
1184 1.1 cgd printf("(%d)", (s-r->program)+(next-s));
1185 1.1 cgd s += 3;
1186 1.1 cgd if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1187 1.1 cgd /* Literal string, where present. */
1188 1.1 cgd while (*s != '\0') {
1189 1.1 cgd putchar(*s);
1190 1.1 cgd s++;
1191 1.1 cgd }
1192 1.1 cgd s++;
1193 1.1 cgd }
1194 1.1 cgd putchar('\n');
1195 1.1 cgd }
1196 1.1 cgd
1197 1.1 cgd /* Header fields of interest. */
1198 1.1 cgd if (r->regstart != '\0')
1199 1.1 cgd printf("start `%c' ", r->regstart);
1200 1.1 cgd if (r->reganch)
1201 1.1 cgd printf("anchored ");
1202 1.1 cgd if (r->regmust != NULL)
1203 1.1 cgd printf("must have \"%s\"", r->regmust);
1204 1.1 cgd printf("\n");
1205 1.1 cgd }
1206 1.1 cgd
1207 1.1 cgd /*
1208 1.1 cgd - regprop - printable representation of opcode
1209 1.1 cgd */
1210 1.1 cgd static char *
1211 1.1 cgd regprop(op)
1212 1.1 cgd char *op;
1213 1.1 cgd {
1214 1.8 perry char *p;
1215 1.1 cgd static char buf[50];
1216 1.1 cgd
1217 1.6 mrg (void)strncpy(buf, ":", sizeof(buf) - 1);
1218 1.1 cgd
1219 1.1 cgd switch (OP(op)) {
1220 1.1 cgd case BOL:
1221 1.1 cgd p = "BOL";
1222 1.1 cgd break;
1223 1.1 cgd case EOL:
1224 1.1 cgd p = "EOL";
1225 1.1 cgd break;
1226 1.1 cgd case ANY:
1227 1.1 cgd p = "ANY";
1228 1.1 cgd break;
1229 1.1 cgd case ANYOF:
1230 1.1 cgd p = "ANYOF";
1231 1.1 cgd break;
1232 1.1 cgd case ANYBUT:
1233 1.1 cgd p = "ANYBUT";
1234 1.1 cgd break;
1235 1.1 cgd case BRANCH:
1236 1.1 cgd p = "BRANCH";
1237 1.1 cgd break;
1238 1.1 cgd case EXACTLY:
1239 1.1 cgd p = "EXACTLY";
1240 1.1 cgd break;
1241 1.1 cgd case NOTHING:
1242 1.1 cgd p = "NOTHING";
1243 1.1 cgd break;
1244 1.1 cgd case BACK:
1245 1.1 cgd p = "BACK";
1246 1.1 cgd break;
1247 1.1 cgd case END:
1248 1.1 cgd p = "END";
1249 1.1 cgd break;
1250 1.1 cgd case OPEN+1:
1251 1.1 cgd case OPEN+2:
1252 1.1 cgd case OPEN+3:
1253 1.1 cgd case OPEN+4:
1254 1.1 cgd case OPEN+5:
1255 1.1 cgd case OPEN+6:
1256 1.1 cgd case OPEN+7:
1257 1.1 cgd case OPEN+8:
1258 1.1 cgd case OPEN+9:
1259 1.6 mrg (void)snprintf(buf+strlen(buf), sizeof(buf) - strlen(buf),
1260 1.6 mrg "OPEN%d", OP(op)-OPEN);
1261 1.1 cgd p = NULL;
1262 1.1 cgd break;
1263 1.1 cgd case CLOSE+1:
1264 1.1 cgd case CLOSE+2:
1265 1.1 cgd case CLOSE+3:
1266 1.1 cgd case CLOSE+4:
1267 1.1 cgd case CLOSE+5:
1268 1.1 cgd case CLOSE+6:
1269 1.1 cgd case CLOSE+7:
1270 1.1 cgd case CLOSE+8:
1271 1.1 cgd case CLOSE+9:
1272 1.6 mrg (void)snprintf(buf+strlen(buf), sizeof(buf) - strlen(buf),
1273 1.6 mrg "CLOSE%d", OP(op)-CLOSE);
1274 1.1 cgd p = NULL;
1275 1.1 cgd break;
1276 1.1 cgd case STAR:
1277 1.1 cgd p = "STAR";
1278 1.1 cgd break;
1279 1.1 cgd case PLUS:
1280 1.1 cgd p = "PLUS";
1281 1.1 cgd break;
1282 1.1 cgd case WORDA:
1283 1.1 cgd p = "WORDA";
1284 1.1 cgd break;
1285 1.1 cgd case WORDZ:
1286 1.1 cgd p = "WORDZ";
1287 1.1 cgd break;
1288 1.1 cgd default:
1289 1.1 cgd regerror("corrupted opcode");
1290 1.1 cgd break;
1291 1.1 cgd }
1292 1.1 cgd if (p != NULL)
1293 1.6 mrg (void)strncat(buf, p, sizeof(buf) - strlen(buf) - 1);
1294 1.1 cgd return(buf);
1295 1.1 cgd }
1296 1.1 cgd #endif
1297 1.1 cgd
1298 1.1 cgd /*
1299 1.1 cgd * The following is provided for those people who do not have strcspn() in
1300 1.1 cgd * their C libraries. They should get off their butts and do something
1301 1.1 cgd * about it; at least one public-domain implementation of those (highly
1302 1.1 cgd * useful) string routines has been published on Usenet.
1303 1.1 cgd */
1304 1.1 cgd #ifdef STRCSPN
1305 1.1 cgd /*
1306 1.1 cgd * strcspn - find length of initial segment of s1 consisting entirely
1307 1.1 cgd * of characters not from s2
1308 1.1 cgd */
1309 1.1 cgd
1310 1.1 cgd static int
1311 1.1 cgd strcspn(s1, s2)
1312 1.1 cgd char *s1;
1313 1.1 cgd char *s2;
1314 1.1 cgd {
1315 1.8 perry char *scan1;
1316 1.8 perry char *scan2;
1317 1.8 perry int count;
1318 1.1 cgd
1319 1.1 cgd count = 0;
1320 1.1 cgd for (scan1 = s1; *scan1 != '\0'; scan1++) {
1321 1.1 cgd for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1322 1.1 cgd if (*scan1 == *scan2++)
1323 1.1 cgd return(count);
1324 1.1 cgd count++;
1325 1.1 cgd }
1326 1.1 cgd return(count);
1327 1.1 cgd }
1328 1.1 cgd #endif
1329