regexp.c revision 1.6 1 1.1 cgd /*
2 1.1 cgd * regcomp and regexec -- regsub and regerror are elsewhere
3 1.1 cgd *
4 1.1 cgd * Copyright (c) 1986 by University of Toronto.
5 1.1 cgd * Written by Henry Spencer. Not derived from licensed software.
6 1.1 cgd *
7 1.1 cgd * Permission is granted to anyone to use this software for any
8 1.1 cgd * purpose on any computer system, and to redistribute it freely,
9 1.1 cgd * subject to the following restrictions:
10 1.1 cgd *
11 1.1 cgd * 1. The author is not responsible for the consequences of use of
12 1.1 cgd * this software, no matter how awful, even if they arise
13 1.1 cgd * from defects in it.
14 1.1 cgd *
15 1.1 cgd * 2. The origin of this software must not be misrepresented, either
16 1.1 cgd * by explicit claim or by omission.
17 1.1 cgd *
18 1.1 cgd * 3. Altered versions must be plainly marked as such, and must not
19 1.1 cgd * be misrepresented as being the original software.
20 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 1.1 cgd *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 1.1 cgd *** to assist in implementing egrep.
23 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 1.1 cgd *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 1.1 cgd *** as in BSD grep and ex.
26 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 1.1 cgd *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 1.1 cgd *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
30 1.1 cgd *
31 1.1 cgd * Beware that some of this code is subtly aware of the way operator
32 1.1 cgd * precedence is structured in regular expressions. Serious changes in
33 1.1 cgd * regular-expression syntax might require a total rethink.
34 1.1 cgd */
35 1.2 mycroft
36 1.2 mycroft #ifndef lint
37 1.6 mrg static char *rcsid = "$Id: regexp.c,v 1.6 1997/01/23 14:02:39 mrg Exp $";
38 1.2 mycroft #endif /* not lint */
39 1.2 mycroft
40 1.1 cgd #include <regexp.h>
41 1.1 cgd #include <stdio.h>
42 1.1 cgd #include <ctype.h>
43 1.1 cgd #include <stdlib.h>
44 1.1 cgd #include <string.h>
45 1.1 cgd #include "regmagic.h"
46 1.1 cgd
47 1.1 cgd /*
48 1.1 cgd * The "internal use only" fields in regexp.h are present to pass info from
49 1.1 cgd * compile to execute that permits the execute phase to run lots faster on
50 1.1 cgd * simple cases. They are:
51 1.1 cgd *
52 1.1 cgd * regstart char that must begin a match; '\0' if none obvious
53 1.1 cgd * reganch is the match anchored (at beginning-of-line only)?
54 1.1 cgd * regmust string (pointer into program) that match must include, or NULL
55 1.1 cgd * regmlen length of regmust string
56 1.1 cgd *
57 1.1 cgd * Regstart and reganch permit very fast decisions on suitable starting points
58 1.1 cgd * for a match, cutting down the work a lot. Regmust permits fast rejection
59 1.1 cgd * of lines that cannot possibly match. The regmust tests are costly enough
60 1.1 cgd * that regcomp() supplies a regmust only if the r.e. contains something
61 1.1 cgd * potentially expensive (at present, the only such thing detected is * or +
62 1.1 cgd * at the start of the r.e., which can involve a lot of backup). Regmlen is
63 1.1 cgd * supplied because the test in regexec() needs it and regcomp() is computing
64 1.1 cgd * it anyway.
65 1.1 cgd */
66 1.1 cgd
67 1.1 cgd /*
68 1.1 cgd * Structure for regexp "program". This is essentially a linear encoding
69 1.1 cgd * of a nondeterministic finite-state machine (aka syntax charts or
70 1.1 cgd * "railroad normal form" in parsing technology). Each node is an opcode
71 1.1 cgd * plus a "next" pointer, possibly plus an operand. "Next" pointers of
72 1.1 cgd * all nodes except BRANCH implement concatenation; a "next" pointer with
73 1.1 cgd * a BRANCH on both ends of it is connecting two alternatives. (Here we
74 1.1 cgd * have one of the subtle syntax dependencies: an individual BRANCH (as
75 1.1 cgd * opposed to a collection of them) is never concatenated with anything
76 1.1 cgd * because of operator precedence.) The operand of some types of node is
77 1.1 cgd * a literal string; for others, it is a node leading into a sub-FSM. In
78 1.1 cgd * particular, the operand of a BRANCH node is the first node of the branch.
79 1.1 cgd * (NB this is *not* a tree structure: the tail of the branch connects
80 1.1 cgd * to the thing following the set of BRANCHes.) The opcodes are:
81 1.1 cgd */
82 1.1 cgd
83 1.1 cgd /* definition number opnd? meaning */
84 1.1 cgd #define END 0 /* no End of program. */
85 1.1 cgd #define BOL 1 /* no Match "" at beginning of line. */
86 1.1 cgd #define EOL 2 /* no Match "" at end of line. */
87 1.1 cgd #define ANY 3 /* no Match any one character. */
88 1.1 cgd #define ANYOF 4 /* str Match any character in this string. */
89 1.1 cgd #define ANYBUT 5 /* str Match any character not in this string. */
90 1.1 cgd #define BRANCH 6 /* node Match this alternative, or the next... */
91 1.1 cgd #define BACK 7 /* no Match "", "next" ptr points backward. */
92 1.1 cgd #define EXACTLY 8 /* str Match this string. */
93 1.1 cgd #define NOTHING 9 /* no Match empty string. */
94 1.1 cgd #define STAR 10 /* node Match this (simple) thing 0 or more times. */
95 1.1 cgd #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
96 1.1 cgd #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */
97 1.1 cgd #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */
98 1.1 cgd #define OPEN 20 /* no Mark this point in input as start of #n. */
99 1.1 cgd /* OPEN+1 is number 1, etc. */
100 1.1 cgd #define CLOSE 30 /* no Analogous to OPEN. */
101 1.1 cgd
102 1.1 cgd /*
103 1.1 cgd * Opcode notes:
104 1.1 cgd *
105 1.1 cgd * BRANCH The set of branches constituting a single choice are hooked
106 1.1 cgd * together with their "next" pointers, since precedence prevents
107 1.1 cgd * anything being concatenated to any individual branch. The
108 1.1 cgd * "next" pointer of the last BRANCH in a choice points to the
109 1.1 cgd * thing following the whole choice. This is also where the
110 1.1 cgd * final "next" pointer of each individual branch points; each
111 1.1 cgd * branch starts with the operand node of a BRANCH node.
112 1.1 cgd *
113 1.1 cgd * BACK Normal "next" pointers all implicitly point forward; BACK
114 1.1 cgd * exists to make loop structures possible.
115 1.1 cgd *
116 1.1 cgd * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
117 1.1 cgd * BRANCH structures using BACK. Simple cases (one character
118 1.1 cgd * per match) are implemented with STAR and PLUS for speed
119 1.1 cgd * and to minimize recursive plunges.
120 1.1 cgd *
121 1.1 cgd * OPEN,CLOSE ...are numbered at compile time.
122 1.1 cgd */
123 1.1 cgd
124 1.1 cgd /*
125 1.1 cgd * A node is one char of opcode followed by two chars of "next" pointer.
126 1.1 cgd * "Next" pointers are stored as two 8-bit pieces, high order first. The
127 1.1 cgd * value is a positive offset from the opcode of the node containing it.
128 1.1 cgd * An operand, if any, simply follows the node. (Note that much of the
129 1.1 cgd * code generation knows about this implicit relationship.)
130 1.1 cgd *
131 1.1 cgd * Using two bytes for the "next" pointer is vast overkill for most things,
132 1.1 cgd * but allows patterns to get big without disasters.
133 1.1 cgd */
134 1.1 cgd #define OP(p) (*(p))
135 1.1 cgd #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
136 1.1 cgd #define OPERAND(p) ((p) + 3)
137 1.1 cgd
138 1.1 cgd /*
139 1.1 cgd * See regmagic.h for one further detail of program structure.
140 1.1 cgd */
141 1.1 cgd
142 1.1 cgd
143 1.1 cgd /*
144 1.1 cgd * Utility definitions.
145 1.1 cgd */
146 1.1 cgd #ifndef CHARBITS
147 1.1 cgd #define UCHARAT(p) ((int)*(unsigned char *)(p))
148 1.1 cgd #else
149 1.1 cgd #define UCHARAT(p) ((int)*(p)&CHARBITS)
150 1.1 cgd #endif
151 1.1 cgd
152 1.1 cgd #define FAIL(m) { regerror(m); return(NULL); }
153 1.1 cgd #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 1.1 cgd
155 1.1 cgd /*
156 1.1 cgd * Flags to be passed up and down.
157 1.1 cgd */
158 1.1 cgd #define HASWIDTH 01 /* Known never to match null string. */
159 1.1 cgd #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
160 1.1 cgd #define SPSTART 04 /* Starts with * or +. */
161 1.1 cgd #define WORST 0 /* Worst case. */
162 1.1 cgd
163 1.1 cgd /*
164 1.1 cgd * Global work variables for regcomp().
165 1.1 cgd */
166 1.1 cgd static char *regparse; /* Input-scan pointer. */
167 1.1 cgd static int regnpar; /* () count. */
168 1.1 cgd static char regdummy;
169 1.1 cgd static char *regcode; /* Code-emit pointer; ®dummy = don't. */
170 1.1 cgd static long regsize; /* Code size. */
171 1.1 cgd
172 1.1 cgd /*
173 1.1 cgd * Forward declarations for regcomp()'s friends.
174 1.1 cgd */
175 1.1 cgd #ifndef STATIC
176 1.1 cgd #define STATIC static
177 1.1 cgd #endif
178 1.4 pk STATIC char *reg __P((int, int *));
179 1.4 pk STATIC char *regbranch __P((int *));
180 1.4 pk STATIC char *regpiece __P((int *));
181 1.4 pk STATIC char *regatom __P((int *));
182 1.4 pk STATIC char *regnode __P((char));
183 1.4 pk STATIC char *regnext __P((char *));
184 1.4 pk STATIC void regc __P((char));
185 1.4 pk STATIC void reginsert __P((char, char *));
186 1.4 pk STATIC void regtail __P((char *, char *));
187 1.4 pk STATIC void regoptail __P((char *, char *));
188 1.1 cgd #ifdef STRCSPN
189 1.4 pk STATIC int strcspn __P((char *, char *));
190 1.1 cgd #endif
191 1.1 cgd
192 1.1 cgd /*
193 1.1 cgd - regcomp - compile a regular expression into internal code
194 1.1 cgd *
195 1.1 cgd * We can't allocate space until we know how big the compiled form will be,
196 1.1 cgd * but we can't compile it (and thus know how big it is) until we've got a
197 1.1 cgd * place to put the code. So we cheat: we compile it twice, once with code
198 1.1 cgd * generation turned off and size counting turned on, and once "for real".
199 1.1 cgd * This also means that we don't allocate space until we are sure that the
200 1.1 cgd * thing really will compile successfully, and we never have to move the
201 1.1 cgd * code and thus invalidate pointers into it. (Note that it has to be in
202 1.1 cgd * one piece because free() must be able to free it all.)
203 1.1 cgd *
204 1.1 cgd * Beware that the optimization-preparation code in here knows about some
205 1.1 cgd * of the structure of the compiled regexp.
206 1.1 cgd */
207 1.1 cgd regexp *
208 1.1 cgd regcomp(exp)
209 1.1 cgd const char *exp;
210 1.1 cgd {
211 1.1 cgd register regexp *r;
212 1.1 cgd register char *scan;
213 1.1 cgd register char *longest;
214 1.1 cgd register int len;
215 1.1 cgd int flags;
216 1.1 cgd
217 1.1 cgd if (exp == NULL)
218 1.1 cgd FAIL("NULL argument");
219 1.1 cgd
220 1.1 cgd /* First pass: determine size, legality. */
221 1.1 cgd #ifdef notdef
222 1.1 cgd if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */
223 1.1 cgd #endif
224 1.1 cgd regparse = (char *)exp;
225 1.1 cgd regnpar = 1;
226 1.1 cgd regsize = 0L;
227 1.1 cgd regcode = ®dummy;
228 1.1 cgd regc(MAGIC);
229 1.1 cgd if (reg(0, &flags) == NULL)
230 1.1 cgd return(NULL);
231 1.1 cgd
232 1.1 cgd /* Small enough for pointer-storage convention? */
233 1.1 cgd if (regsize >= 32767L) /* Probably could be 65535L. */
234 1.1 cgd FAIL("regexp too big");
235 1.1 cgd
236 1.1 cgd /* Allocate space. */
237 1.1 cgd r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
238 1.1 cgd if (r == NULL)
239 1.1 cgd FAIL("out of space");
240 1.1 cgd
241 1.1 cgd /* Second pass: emit code. */
242 1.1 cgd regparse = (char *)exp;
243 1.1 cgd regnpar = 1;
244 1.1 cgd regcode = r->program;
245 1.1 cgd regc(MAGIC);
246 1.1 cgd if (reg(0, &flags) == NULL)
247 1.1 cgd return(NULL);
248 1.1 cgd
249 1.1 cgd /* Dig out information for optimizations. */
250 1.1 cgd r->regstart = '\0'; /* Worst-case defaults. */
251 1.1 cgd r->reganch = 0;
252 1.1 cgd r->regmust = NULL;
253 1.1 cgd r->regmlen = 0;
254 1.1 cgd scan = r->program+1; /* First BRANCH. */
255 1.1 cgd if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
256 1.1 cgd scan = OPERAND(scan);
257 1.1 cgd
258 1.1 cgd /* Starting-point info. */
259 1.1 cgd if (OP(scan) == EXACTLY)
260 1.1 cgd r->regstart = *OPERAND(scan);
261 1.1 cgd else if (OP(scan) == BOL)
262 1.1 cgd r->reganch++;
263 1.1 cgd
264 1.1 cgd /*
265 1.1 cgd * If there's something expensive in the r.e., find the
266 1.1 cgd * longest literal string that must appear and make it the
267 1.1 cgd * regmust. Resolve ties in favor of later strings, since
268 1.1 cgd * the regstart check works with the beginning of the r.e.
269 1.1 cgd * and avoiding duplication strengthens checking. Not a
270 1.1 cgd * strong reason, but sufficient in the absence of others.
271 1.1 cgd */
272 1.1 cgd if (flags&SPSTART) {
273 1.1 cgd longest = NULL;
274 1.1 cgd len = 0;
275 1.1 cgd for (; scan != NULL; scan = regnext(scan))
276 1.1 cgd if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
277 1.1 cgd longest = OPERAND(scan);
278 1.1 cgd len = strlen(OPERAND(scan));
279 1.1 cgd }
280 1.1 cgd r->regmust = longest;
281 1.1 cgd r->regmlen = len;
282 1.1 cgd }
283 1.1 cgd }
284 1.1 cgd
285 1.1 cgd return(r);
286 1.1 cgd }
287 1.1 cgd
288 1.1 cgd /*
289 1.1 cgd - reg - regular expression, i.e. main body or parenthesized thing
290 1.1 cgd *
291 1.1 cgd * Caller must absorb opening parenthesis.
292 1.1 cgd *
293 1.1 cgd * Combining parenthesis handling with the base level of regular expression
294 1.1 cgd * is a trifle forced, but the need to tie the tails of the branches to what
295 1.1 cgd * follows makes it hard to avoid.
296 1.1 cgd */
297 1.1 cgd static char *
298 1.1 cgd reg(paren, flagp)
299 1.1 cgd int paren; /* Parenthesized? */
300 1.1 cgd int *flagp;
301 1.1 cgd {
302 1.1 cgd register char *ret;
303 1.1 cgd register char *br;
304 1.1 cgd register char *ender;
305 1.4 pk register int parno = 0;
306 1.1 cgd int flags;
307 1.1 cgd
308 1.1 cgd *flagp = HASWIDTH; /* Tentatively. */
309 1.1 cgd
310 1.1 cgd /* Make an OPEN node, if parenthesized. */
311 1.1 cgd if (paren) {
312 1.1 cgd if (regnpar >= NSUBEXP)
313 1.1 cgd FAIL("too many ()");
314 1.1 cgd parno = regnpar;
315 1.1 cgd regnpar++;
316 1.1 cgd ret = regnode(OPEN+parno);
317 1.1 cgd } else
318 1.1 cgd ret = NULL;
319 1.1 cgd
320 1.1 cgd /* Pick up the branches, linking them together. */
321 1.1 cgd br = regbranch(&flags);
322 1.1 cgd if (br == NULL)
323 1.1 cgd return(NULL);
324 1.1 cgd if (ret != NULL)
325 1.1 cgd regtail(ret, br); /* OPEN -> first. */
326 1.1 cgd else
327 1.1 cgd ret = br;
328 1.1 cgd if (!(flags&HASWIDTH))
329 1.1 cgd *flagp &= ~HASWIDTH;
330 1.1 cgd *flagp |= flags&SPSTART;
331 1.1 cgd while (*regparse == '|' || *regparse == '\n') {
332 1.1 cgd regparse++;
333 1.1 cgd br = regbranch(&flags);
334 1.1 cgd if (br == NULL)
335 1.1 cgd return(NULL);
336 1.1 cgd regtail(ret, br); /* BRANCH -> BRANCH. */
337 1.1 cgd if (!(flags&HASWIDTH))
338 1.1 cgd *flagp &= ~HASWIDTH;
339 1.1 cgd *flagp |= flags&SPSTART;
340 1.1 cgd }
341 1.1 cgd
342 1.1 cgd /* Make a closing node, and hook it on the end. */
343 1.1 cgd ender = regnode((paren) ? CLOSE+parno : END);
344 1.1 cgd regtail(ret, ender);
345 1.1 cgd
346 1.1 cgd /* Hook the tails of the branches to the closing node. */
347 1.1 cgd for (br = ret; br != NULL; br = regnext(br))
348 1.1 cgd regoptail(br, ender);
349 1.1 cgd
350 1.1 cgd /* Check for proper termination. */
351 1.1 cgd if (paren && *regparse++ != ')') {
352 1.1 cgd FAIL("unmatched ()");
353 1.1 cgd } else if (!paren && *regparse != '\0') {
354 1.1 cgd if (*regparse == ')') {
355 1.1 cgd FAIL("unmatched ()");
356 1.1 cgd } else
357 1.1 cgd FAIL("junk on end"); /* "Can't happen". */
358 1.1 cgd /* NOTREACHED */
359 1.1 cgd }
360 1.1 cgd
361 1.1 cgd return(ret);
362 1.1 cgd }
363 1.1 cgd
364 1.1 cgd /*
365 1.1 cgd - regbranch - one alternative of an | operator
366 1.1 cgd *
367 1.1 cgd * Implements the concatenation operator.
368 1.1 cgd */
369 1.1 cgd static char *
370 1.1 cgd regbranch(flagp)
371 1.1 cgd int *flagp;
372 1.1 cgd {
373 1.1 cgd register char *ret;
374 1.1 cgd register char *chain;
375 1.1 cgd register char *latest;
376 1.1 cgd int flags;
377 1.1 cgd
378 1.1 cgd *flagp = WORST; /* Tentatively. */
379 1.1 cgd
380 1.1 cgd ret = regnode(BRANCH);
381 1.1 cgd chain = NULL;
382 1.1 cgd while (*regparse != '\0' && *regparse != ')' &&
383 1.1 cgd *regparse != '\n' && *regparse != '|') {
384 1.1 cgd latest = regpiece(&flags);
385 1.1 cgd if (latest == NULL)
386 1.1 cgd return(NULL);
387 1.1 cgd *flagp |= flags&HASWIDTH;
388 1.1 cgd if (chain == NULL) /* First piece. */
389 1.1 cgd *flagp |= flags&SPSTART;
390 1.1 cgd else
391 1.1 cgd regtail(chain, latest);
392 1.1 cgd chain = latest;
393 1.1 cgd }
394 1.1 cgd if (chain == NULL) /* Loop ran zero times. */
395 1.1 cgd (void) regnode(NOTHING);
396 1.1 cgd
397 1.1 cgd return(ret);
398 1.1 cgd }
399 1.1 cgd
400 1.1 cgd /*
401 1.1 cgd - regpiece - something followed by possible [*+?]
402 1.1 cgd *
403 1.1 cgd * Note that the branching code sequences used for ? and the general cases
404 1.1 cgd * of * and + are somewhat optimized: they use the same NOTHING node as
405 1.1 cgd * both the endmarker for their branch list and the body of the last branch.
406 1.1 cgd * It might seem that this node could be dispensed with entirely, but the
407 1.1 cgd * endmarker role is not redundant.
408 1.1 cgd */
409 1.1 cgd static char *
410 1.1 cgd regpiece(flagp)
411 1.1 cgd int *flagp;
412 1.1 cgd {
413 1.1 cgd register char *ret;
414 1.1 cgd register char op;
415 1.1 cgd register char *next;
416 1.1 cgd int flags;
417 1.1 cgd
418 1.1 cgd ret = regatom(&flags);
419 1.1 cgd if (ret == NULL)
420 1.1 cgd return(NULL);
421 1.1 cgd
422 1.1 cgd op = *regparse;
423 1.1 cgd if (!ISMULT(op)) {
424 1.1 cgd *flagp = flags;
425 1.1 cgd return(ret);
426 1.1 cgd }
427 1.1 cgd
428 1.1 cgd if (!(flags&HASWIDTH) && op != '?')
429 1.1 cgd FAIL("*+ operand could be empty");
430 1.1 cgd *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
431 1.1 cgd
432 1.1 cgd if (op == '*' && (flags&SIMPLE))
433 1.1 cgd reginsert(STAR, ret);
434 1.1 cgd else if (op == '*') {
435 1.1 cgd /* Emit x* as (x&|), where & means "self". */
436 1.1 cgd reginsert(BRANCH, ret); /* Either x */
437 1.1 cgd regoptail(ret, regnode(BACK)); /* and loop */
438 1.1 cgd regoptail(ret, ret); /* back */
439 1.1 cgd regtail(ret, regnode(BRANCH)); /* or */
440 1.1 cgd regtail(ret, regnode(NOTHING)); /* null. */
441 1.1 cgd } else if (op == '+' && (flags&SIMPLE))
442 1.1 cgd reginsert(PLUS, ret);
443 1.1 cgd else if (op == '+') {
444 1.1 cgd /* Emit x+ as x(&|), where & means "self". */
445 1.1 cgd next = regnode(BRANCH); /* Either */
446 1.1 cgd regtail(ret, next);
447 1.1 cgd regtail(regnode(BACK), ret); /* loop back */
448 1.1 cgd regtail(next, regnode(BRANCH)); /* or */
449 1.1 cgd regtail(ret, regnode(NOTHING)); /* null. */
450 1.1 cgd } else if (op == '?') {
451 1.1 cgd /* Emit x? as (x|) */
452 1.1 cgd reginsert(BRANCH, ret); /* Either x */
453 1.1 cgd regtail(ret, regnode(BRANCH)); /* or */
454 1.1 cgd next = regnode(NOTHING); /* null. */
455 1.1 cgd regtail(ret, next);
456 1.1 cgd regoptail(ret, next);
457 1.1 cgd }
458 1.1 cgd regparse++;
459 1.1 cgd if (ISMULT(*regparse))
460 1.1 cgd FAIL("nested *?+");
461 1.1 cgd
462 1.1 cgd return(ret);
463 1.1 cgd }
464 1.1 cgd
465 1.1 cgd /*
466 1.1 cgd - regatom - the lowest level
467 1.1 cgd *
468 1.1 cgd * Optimization: gobbles an entire sequence of ordinary characters so that
469 1.1 cgd * it can turn them into a single node, which is smaller to store and
470 1.1 cgd * faster to run. Backslashed characters are exceptions, each becoming a
471 1.1 cgd * separate node; the code is simpler that way and it's not worth fixing.
472 1.1 cgd */
473 1.1 cgd static char *
474 1.1 cgd regatom(flagp)
475 1.1 cgd int *flagp;
476 1.1 cgd {
477 1.1 cgd register char *ret;
478 1.1 cgd int flags;
479 1.1 cgd
480 1.1 cgd *flagp = WORST; /* Tentatively. */
481 1.1 cgd
482 1.1 cgd switch (*regparse++) {
483 1.1 cgd /* FIXME: these chars only have meaning at beg/end of pat? */
484 1.1 cgd case '^':
485 1.1 cgd ret = regnode(BOL);
486 1.1 cgd break;
487 1.1 cgd case '$':
488 1.1 cgd ret = regnode(EOL);
489 1.1 cgd break;
490 1.1 cgd case '.':
491 1.1 cgd ret = regnode(ANY);
492 1.1 cgd *flagp |= HASWIDTH|SIMPLE;
493 1.1 cgd break;
494 1.1 cgd case '[': {
495 1.1 cgd register int class;
496 1.1 cgd register int classend;
497 1.1 cgd
498 1.1 cgd if (*regparse == '^') { /* Complement of range. */
499 1.1 cgd ret = regnode(ANYBUT);
500 1.1 cgd regparse++;
501 1.1 cgd } else
502 1.1 cgd ret = regnode(ANYOF);
503 1.1 cgd if (*regparse == ']' || *regparse == '-')
504 1.1 cgd regc(*regparse++);
505 1.1 cgd while (*regparse != '\0' && *regparse != ']') {
506 1.1 cgd if (*regparse == '-') {
507 1.1 cgd regparse++;
508 1.1 cgd if (*regparse == ']' || *regparse == '\0')
509 1.1 cgd regc('-');
510 1.1 cgd else {
511 1.1 cgd class = UCHARAT(regparse-2)+1;
512 1.1 cgd classend = UCHARAT(regparse);
513 1.1 cgd if (class > classend+1)
514 1.1 cgd FAIL("invalid [] range");
515 1.1 cgd for (; class <= classend; class++)
516 1.1 cgd regc(class);
517 1.1 cgd regparse++;
518 1.1 cgd }
519 1.1 cgd } else
520 1.1 cgd regc(*regparse++);
521 1.1 cgd }
522 1.1 cgd regc('\0');
523 1.1 cgd if (*regparse != ']')
524 1.1 cgd FAIL("unmatched []");
525 1.1 cgd regparse++;
526 1.1 cgd *flagp |= HASWIDTH|SIMPLE;
527 1.1 cgd }
528 1.1 cgd break;
529 1.1 cgd case '(':
530 1.1 cgd ret = reg(1, &flags);
531 1.1 cgd if (ret == NULL)
532 1.1 cgd return(NULL);
533 1.1 cgd *flagp |= flags&(HASWIDTH|SPSTART);
534 1.1 cgd break;
535 1.1 cgd case '\0':
536 1.1 cgd case '|':
537 1.1 cgd case '\n':
538 1.1 cgd case ')':
539 1.1 cgd FAIL("internal urp"); /* Supposed to be caught earlier. */
540 1.1 cgd break;
541 1.1 cgd case '?':
542 1.1 cgd case '+':
543 1.1 cgd case '*':
544 1.1 cgd FAIL("?+* follows nothing");
545 1.1 cgd break;
546 1.1 cgd case '\\':
547 1.1 cgd switch (*regparse++) {
548 1.1 cgd case '\0':
549 1.1 cgd FAIL("trailing \\");
550 1.1 cgd break;
551 1.1 cgd case '<':
552 1.1 cgd ret = regnode(WORDA);
553 1.1 cgd break;
554 1.1 cgd case '>':
555 1.1 cgd ret = regnode(WORDZ);
556 1.1 cgd break;
557 1.1 cgd /* FIXME: Someday handle \1, \2, ... */
558 1.1 cgd default:
559 1.1 cgd /* Handle general quoted chars in exact-match routine */
560 1.1 cgd goto de_fault;
561 1.1 cgd }
562 1.1 cgd break;
563 1.1 cgd de_fault:
564 1.1 cgd default:
565 1.1 cgd /*
566 1.1 cgd * Encode a string of characters to be matched exactly.
567 1.1 cgd *
568 1.1 cgd * This is a bit tricky due to quoted chars and due to
569 1.1 cgd * '*', '+', and '?' taking the SINGLE char previous
570 1.1 cgd * as their operand.
571 1.1 cgd *
572 1.1 cgd * On entry, the char at regparse[-1] is going to go
573 1.1 cgd * into the string, no matter what it is. (It could be
574 1.1 cgd * following a \ if we are entered from the '\' case.)
575 1.1 cgd *
576 1.1 cgd * Basic idea is to pick up a good char in ch and
577 1.1 cgd * examine the next char. If it's *+? then we twiddle.
578 1.1 cgd * If it's \ then we frozzle. If it's other magic char
579 1.1 cgd * we push ch and terminate the string. If none of the
580 1.1 cgd * above, we push ch on the string and go around again.
581 1.1 cgd *
582 1.1 cgd * regprev is used to remember where "the current char"
583 1.1 cgd * starts in the string, if due to a *+? we need to back
584 1.1 cgd * up and put the current char in a separate, 1-char, string.
585 1.1 cgd * When regprev is NULL, ch is the only char in the
586 1.1 cgd * string; this is used in *+? handling, and in setting
587 1.1 cgd * flags |= SIMPLE at the end.
588 1.1 cgd */
589 1.1 cgd {
590 1.1 cgd char *regprev;
591 1.1 cgd register char ch;
592 1.1 cgd
593 1.1 cgd regparse--; /* Look at cur char */
594 1.1 cgd ret = regnode(EXACTLY);
595 1.1 cgd for ( regprev = 0 ; ; ) {
596 1.1 cgd ch = *regparse++; /* Get current char */
597 1.1 cgd switch (*regparse) { /* look at next one */
598 1.1 cgd
599 1.1 cgd default:
600 1.1 cgd regc(ch); /* Add cur to string */
601 1.1 cgd break;
602 1.1 cgd
603 1.1 cgd case '.': case '[': case '(':
604 1.1 cgd case ')': case '|': case '\n':
605 1.1 cgd case '$': case '^':
606 1.1 cgd case '\0':
607 1.1 cgd /* FIXME, $ and ^ should not always be magic */
608 1.1 cgd magic:
609 1.1 cgd regc(ch); /* dump cur char */
610 1.1 cgd goto done; /* and we are done */
611 1.1 cgd
612 1.1 cgd case '?': case '+': case '*':
613 1.1 cgd if (!regprev) /* If just ch in str, */
614 1.1 cgd goto magic; /* use it */
615 1.1 cgd /* End mult-char string one early */
616 1.1 cgd regparse = regprev; /* Back up parse */
617 1.1 cgd goto done;
618 1.1 cgd
619 1.1 cgd case '\\':
620 1.1 cgd regc(ch); /* Cur char OK */
621 1.1 cgd switch (regparse[1]){ /* Look after \ */
622 1.1 cgd case '\0':
623 1.1 cgd case '<':
624 1.1 cgd case '>':
625 1.1 cgd /* FIXME: Someday handle \1, \2, ... */
626 1.1 cgd goto done; /* Not quoted */
627 1.1 cgd default:
628 1.1 cgd /* Backup point is \, scan * point is after it. */
629 1.1 cgd regprev = regparse;
630 1.1 cgd regparse++;
631 1.1 cgd continue; /* NOT break; */
632 1.1 cgd }
633 1.1 cgd }
634 1.1 cgd regprev = regparse; /* Set backup point */
635 1.1 cgd }
636 1.1 cgd done:
637 1.1 cgd regc('\0');
638 1.1 cgd *flagp |= HASWIDTH;
639 1.1 cgd if (!regprev) /* One char? */
640 1.1 cgd *flagp |= SIMPLE;
641 1.1 cgd }
642 1.1 cgd break;
643 1.1 cgd }
644 1.1 cgd
645 1.1 cgd return(ret);
646 1.1 cgd }
647 1.1 cgd
648 1.1 cgd /*
649 1.1 cgd - regnode - emit a node
650 1.1 cgd */
651 1.1 cgd static char * /* Location. */
652 1.1 cgd regnode(op)
653 1.1 cgd char op;
654 1.1 cgd {
655 1.1 cgd register char *ret;
656 1.1 cgd register char *ptr;
657 1.1 cgd
658 1.1 cgd ret = regcode;
659 1.1 cgd if (ret == ®dummy) {
660 1.1 cgd regsize += 3;
661 1.1 cgd return(ret);
662 1.1 cgd }
663 1.1 cgd
664 1.1 cgd ptr = ret;
665 1.1 cgd *ptr++ = op;
666 1.1 cgd *ptr++ = '\0'; /* Null "next" pointer. */
667 1.1 cgd *ptr++ = '\0';
668 1.1 cgd regcode = ptr;
669 1.1 cgd
670 1.1 cgd return(ret);
671 1.1 cgd }
672 1.1 cgd
673 1.1 cgd /*
674 1.1 cgd - regc - emit (if appropriate) a byte of code
675 1.1 cgd */
676 1.1 cgd static void
677 1.1 cgd regc(b)
678 1.1 cgd char b;
679 1.1 cgd {
680 1.1 cgd if (regcode != ®dummy)
681 1.1 cgd *regcode++ = b;
682 1.1 cgd else
683 1.1 cgd regsize++;
684 1.1 cgd }
685 1.1 cgd
686 1.1 cgd /*
687 1.1 cgd - reginsert - insert an operator in front of already-emitted operand
688 1.1 cgd *
689 1.1 cgd * Means relocating the operand.
690 1.1 cgd */
691 1.1 cgd static void
692 1.1 cgd reginsert(op, opnd)
693 1.1 cgd char op;
694 1.1 cgd char *opnd;
695 1.1 cgd {
696 1.1 cgd register char *src;
697 1.1 cgd register char *dst;
698 1.1 cgd register char *place;
699 1.1 cgd
700 1.1 cgd if (regcode == ®dummy) {
701 1.1 cgd regsize += 3;
702 1.1 cgd return;
703 1.1 cgd }
704 1.1 cgd
705 1.1 cgd src = regcode;
706 1.1 cgd regcode += 3;
707 1.1 cgd dst = regcode;
708 1.1 cgd while (src > opnd)
709 1.1 cgd *--dst = *--src;
710 1.1 cgd
711 1.1 cgd place = opnd; /* Op node, where operand used to be. */
712 1.1 cgd *place++ = op;
713 1.1 cgd *place++ = '\0';
714 1.1 cgd *place++ = '\0';
715 1.1 cgd }
716 1.1 cgd
717 1.1 cgd /*
718 1.1 cgd - regtail - set the next-pointer at the end of a node chain
719 1.1 cgd */
720 1.1 cgd static void
721 1.1 cgd regtail(p, val)
722 1.1 cgd char *p;
723 1.1 cgd char *val;
724 1.1 cgd {
725 1.1 cgd register char *scan;
726 1.1 cgd register char *temp;
727 1.1 cgd register int offset;
728 1.1 cgd
729 1.1 cgd if (p == ®dummy)
730 1.1 cgd return;
731 1.1 cgd
732 1.1 cgd /* Find last node. */
733 1.1 cgd scan = p;
734 1.1 cgd for (;;) {
735 1.1 cgd temp = regnext(scan);
736 1.1 cgd if (temp == NULL)
737 1.1 cgd break;
738 1.1 cgd scan = temp;
739 1.1 cgd }
740 1.1 cgd
741 1.1 cgd if (OP(scan) == BACK)
742 1.1 cgd offset = scan - val;
743 1.1 cgd else
744 1.1 cgd offset = val - scan;
745 1.1 cgd *(scan+1) = (offset>>8)&0377;
746 1.1 cgd *(scan+2) = offset&0377;
747 1.1 cgd }
748 1.1 cgd
749 1.1 cgd /*
750 1.1 cgd - regoptail - regtail on operand of first argument; nop if operandless
751 1.1 cgd */
752 1.1 cgd static void
753 1.1 cgd regoptail(p, val)
754 1.1 cgd char *p;
755 1.1 cgd char *val;
756 1.1 cgd {
757 1.1 cgd /* "Operandless" and "op != BRANCH" are synonymous in practice. */
758 1.1 cgd if (p == NULL || p == ®dummy || OP(p) != BRANCH)
759 1.1 cgd return;
760 1.1 cgd regtail(OPERAND(p), val);
761 1.1 cgd }
762 1.1 cgd
763 1.1 cgd /*
764 1.1 cgd * regexec and friends
765 1.1 cgd */
766 1.1 cgd
767 1.1 cgd /*
768 1.1 cgd * Global work variables for regexec().
769 1.1 cgd */
770 1.1 cgd static char *reginput; /* String-input pointer. */
771 1.1 cgd static char *regbol; /* Beginning of input, for ^ check. */
772 1.1 cgd static char **regstartp; /* Pointer to startp array. */
773 1.1 cgd static char **regendp; /* Ditto for endp. */
774 1.1 cgd
775 1.1 cgd /*
776 1.1 cgd * Forwards.
777 1.1 cgd */
778 1.4 pk STATIC int regtry __P((const regexp *, const char *));
779 1.4 pk STATIC int regmatch __P((char *));
780 1.4 pk STATIC int regrepeat __P((char *));
781 1.1 cgd
782 1.1 cgd #ifdef DEBUG
783 1.1 cgd int regnarrate = 0;
784 1.4 pk void regdump __P((regexp *));
785 1.4 pk STATIC char *regprop __P((char *));
786 1.1 cgd #endif
787 1.1 cgd
788 1.1 cgd /*
789 1.1 cgd - regexec - match a regexp against a string
790 1.1 cgd */
791 1.1 cgd int
792 1.1 cgd regexec(prog, string)
793 1.1 cgd register const regexp *prog;
794 1.1 cgd register const char *string;
795 1.1 cgd {
796 1.1 cgd register char *s;
797 1.1 cgd
798 1.1 cgd /* Be paranoid... */
799 1.1 cgd if (prog == NULL || string == NULL) {
800 1.1 cgd regerror("NULL parameter");
801 1.1 cgd return(0);
802 1.1 cgd }
803 1.1 cgd
804 1.1 cgd /* Check validity of program. */
805 1.1 cgd if (UCHARAT(prog->program) != MAGIC) {
806 1.1 cgd regerror("corrupted program");
807 1.1 cgd return(0);
808 1.1 cgd }
809 1.1 cgd
810 1.1 cgd /* If there is a "must appear" string, look for it. */
811 1.1 cgd if (prog->regmust != NULL) {
812 1.1 cgd s = (char *)string;
813 1.1 cgd while ((s = strchr(s, prog->regmust[0])) != NULL) {
814 1.1 cgd if (strncmp(s, prog->regmust, prog->regmlen) == 0)
815 1.1 cgd break; /* Found it. */
816 1.1 cgd s++;
817 1.1 cgd }
818 1.1 cgd if (s == NULL) /* Not present. */
819 1.1 cgd return(0);
820 1.1 cgd }
821 1.1 cgd
822 1.1 cgd /* Mark beginning of line for ^ . */
823 1.1 cgd regbol = (char *)string;
824 1.1 cgd
825 1.1 cgd /* Simplest case: anchored match need be tried only once. */
826 1.1 cgd if (prog->reganch)
827 1.1 cgd return(regtry(prog, string));
828 1.1 cgd
829 1.1 cgd /* Messy cases: unanchored match. */
830 1.1 cgd s = (char *)string;
831 1.1 cgd if (prog->regstart != '\0')
832 1.1 cgd /* We know what char it must start with. */
833 1.1 cgd while ((s = strchr(s, prog->regstart)) != NULL) {
834 1.1 cgd if (regtry(prog, s))
835 1.1 cgd return(1);
836 1.1 cgd s++;
837 1.1 cgd }
838 1.1 cgd else
839 1.1 cgd /* We don't -- general case. */
840 1.1 cgd do {
841 1.1 cgd if (regtry(prog, s))
842 1.1 cgd return(1);
843 1.1 cgd } while (*s++ != '\0');
844 1.1 cgd
845 1.1 cgd /* Failure. */
846 1.1 cgd return(0);
847 1.1 cgd }
848 1.1 cgd
849 1.1 cgd /*
850 1.1 cgd - regtry - try match at specific point
851 1.1 cgd */
852 1.1 cgd static int /* 0 failure, 1 success */
853 1.1 cgd regtry(prog, string)
854 1.4 pk const regexp *prog;
855 1.4 pk const char *string;
856 1.1 cgd {
857 1.1 cgd register int i;
858 1.1 cgd register char **sp;
859 1.1 cgd register char **ep;
860 1.1 cgd
861 1.5 cgd reginput = (char *)string; /* XXX */
862 1.5 cgd regstartp = (char **)prog->startp; /* XXX */
863 1.5 cgd regendp = (char **)prog->endp; /* XXX */
864 1.1 cgd
865 1.5 cgd sp = (char **)prog->startp; /* XXX */
866 1.5 cgd ep = (char **)prog->endp; /* XXX */
867 1.1 cgd for (i = NSUBEXP; i > 0; i--) {
868 1.1 cgd *sp++ = NULL;
869 1.1 cgd *ep++ = NULL;
870 1.1 cgd }
871 1.5 cgd if (regmatch((char *)prog->program + 1)) { /* XXX */
872 1.5 cgd ((regexp *)prog)->startp[0] = (char *)string; /* XXX */
873 1.5 cgd ((regexp *)prog)->endp[0] = reginput; /* XXX */
874 1.1 cgd return(1);
875 1.1 cgd } else
876 1.1 cgd return(0);
877 1.1 cgd }
878 1.1 cgd
879 1.1 cgd /*
880 1.1 cgd - regmatch - main matching routine
881 1.1 cgd *
882 1.1 cgd * Conceptually the strategy is simple: check to see whether the current
883 1.1 cgd * node matches, call self recursively to see whether the rest matches,
884 1.1 cgd * and then act accordingly. In practice we make some effort to avoid
885 1.1 cgd * recursion, in particular by going through "ordinary" nodes (that don't
886 1.1 cgd * need to know whether the rest of the match failed) by a loop instead of
887 1.1 cgd * by recursion.
888 1.1 cgd */
889 1.1 cgd static int /* 0 failure, 1 success */
890 1.1 cgd regmatch(prog)
891 1.1 cgd char *prog;
892 1.1 cgd {
893 1.1 cgd register char *scan; /* Current node. */
894 1.1 cgd char *next; /* Next node. */
895 1.1 cgd
896 1.1 cgd scan = prog;
897 1.1 cgd #ifdef DEBUG
898 1.1 cgd if (scan != NULL && regnarrate)
899 1.1 cgd fprintf(stderr, "%s(\n", regprop(scan));
900 1.1 cgd #endif
901 1.1 cgd while (scan != NULL) {
902 1.1 cgd #ifdef DEBUG
903 1.1 cgd if (regnarrate)
904 1.1 cgd fprintf(stderr, "%s...\n", regprop(scan));
905 1.1 cgd #endif
906 1.1 cgd next = regnext(scan);
907 1.1 cgd
908 1.1 cgd switch (OP(scan)) {
909 1.1 cgd case BOL:
910 1.1 cgd if (reginput != regbol)
911 1.1 cgd return(0);
912 1.1 cgd break;
913 1.1 cgd case EOL:
914 1.1 cgd if (*reginput != '\0')
915 1.1 cgd return(0);
916 1.1 cgd break;
917 1.1 cgd case WORDA:
918 1.1 cgd /* Must be looking at a letter, digit, or _ */
919 1.1 cgd if ((!isalnum(*reginput)) && *reginput != '_')
920 1.1 cgd return(0);
921 1.1 cgd /* Prev must be BOL or nonword */
922 1.1 cgd if (reginput > regbol &&
923 1.1 cgd (isalnum(reginput[-1]) || reginput[-1] == '_'))
924 1.1 cgd return(0);
925 1.1 cgd break;
926 1.1 cgd case WORDZ:
927 1.1 cgd /* Must be looking at non letter, digit, or _ */
928 1.1 cgd if (isalnum(*reginput) || *reginput == '_')
929 1.1 cgd return(0);
930 1.1 cgd /* We don't care what the previous char was */
931 1.1 cgd break;
932 1.1 cgd case ANY:
933 1.1 cgd if (*reginput == '\0')
934 1.1 cgd return(0);
935 1.1 cgd reginput++;
936 1.1 cgd break;
937 1.1 cgd case EXACTLY: {
938 1.1 cgd register int len;
939 1.1 cgd register char *opnd;
940 1.1 cgd
941 1.1 cgd opnd = OPERAND(scan);
942 1.1 cgd /* Inline the first character, for speed. */
943 1.1 cgd if (*opnd != *reginput)
944 1.1 cgd return(0);
945 1.1 cgd len = strlen(opnd);
946 1.1 cgd if (len > 1 && strncmp(opnd, reginput, len) != 0)
947 1.1 cgd return(0);
948 1.1 cgd reginput += len;
949 1.1 cgd }
950 1.1 cgd break;
951 1.1 cgd case ANYOF:
952 1.1 cgd if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
953 1.1 cgd return(0);
954 1.1 cgd reginput++;
955 1.1 cgd break;
956 1.1 cgd case ANYBUT:
957 1.1 cgd if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
958 1.1 cgd return(0);
959 1.1 cgd reginput++;
960 1.1 cgd break;
961 1.1 cgd case NOTHING:
962 1.1 cgd break;
963 1.1 cgd case BACK:
964 1.1 cgd break;
965 1.1 cgd case OPEN+1:
966 1.1 cgd case OPEN+2:
967 1.1 cgd case OPEN+3:
968 1.1 cgd case OPEN+4:
969 1.1 cgd case OPEN+5:
970 1.1 cgd case OPEN+6:
971 1.1 cgd case OPEN+7:
972 1.1 cgd case OPEN+8:
973 1.1 cgd case OPEN+9: {
974 1.1 cgd register int no;
975 1.1 cgd register char *save;
976 1.1 cgd
977 1.1 cgd no = OP(scan) - OPEN;
978 1.1 cgd save = reginput;
979 1.1 cgd
980 1.1 cgd if (regmatch(next)) {
981 1.1 cgd /*
982 1.1 cgd * Don't set startp if some later
983 1.1 cgd * invocation of the same parentheses
984 1.1 cgd * already has.
985 1.1 cgd */
986 1.1 cgd if (regstartp[no] == NULL)
987 1.1 cgd regstartp[no] = save;
988 1.1 cgd return(1);
989 1.1 cgd } else
990 1.1 cgd return(0);
991 1.1 cgd }
992 1.1 cgd break;
993 1.1 cgd case CLOSE+1:
994 1.1 cgd case CLOSE+2:
995 1.1 cgd case CLOSE+3:
996 1.1 cgd case CLOSE+4:
997 1.1 cgd case CLOSE+5:
998 1.1 cgd case CLOSE+6:
999 1.1 cgd case CLOSE+7:
1000 1.1 cgd case CLOSE+8:
1001 1.1 cgd case CLOSE+9: {
1002 1.1 cgd register int no;
1003 1.1 cgd register char *save;
1004 1.1 cgd
1005 1.1 cgd no = OP(scan) - CLOSE;
1006 1.1 cgd save = reginput;
1007 1.1 cgd
1008 1.1 cgd if (regmatch(next)) {
1009 1.1 cgd /*
1010 1.1 cgd * Don't set endp if some later
1011 1.1 cgd * invocation of the same parentheses
1012 1.1 cgd * already has.
1013 1.1 cgd */
1014 1.1 cgd if (regendp[no] == NULL)
1015 1.1 cgd regendp[no] = save;
1016 1.1 cgd return(1);
1017 1.1 cgd } else
1018 1.1 cgd return(0);
1019 1.1 cgd }
1020 1.1 cgd break;
1021 1.1 cgd case BRANCH: {
1022 1.1 cgd register char *save;
1023 1.1 cgd
1024 1.1 cgd if (OP(next) != BRANCH) /* No choice. */
1025 1.1 cgd next = OPERAND(scan); /* Avoid recursion. */
1026 1.1 cgd else {
1027 1.1 cgd do {
1028 1.1 cgd save = reginput;
1029 1.1 cgd if (regmatch(OPERAND(scan)))
1030 1.1 cgd return(1);
1031 1.1 cgd reginput = save;
1032 1.1 cgd scan = regnext(scan);
1033 1.1 cgd } while (scan != NULL && OP(scan) == BRANCH);
1034 1.1 cgd return(0);
1035 1.1 cgd /* NOTREACHED */
1036 1.1 cgd }
1037 1.1 cgd }
1038 1.1 cgd break;
1039 1.1 cgd case STAR:
1040 1.1 cgd case PLUS: {
1041 1.1 cgd register char nextch;
1042 1.1 cgd register int no;
1043 1.1 cgd register char *save;
1044 1.1 cgd register int min;
1045 1.1 cgd
1046 1.1 cgd /*
1047 1.1 cgd * Lookahead to avoid useless match attempts
1048 1.1 cgd * when we know what character comes next.
1049 1.1 cgd */
1050 1.1 cgd nextch = '\0';
1051 1.1 cgd if (OP(next) == EXACTLY)
1052 1.1 cgd nextch = *OPERAND(next);
1053 1.1 cgd min = (OP(scan) == STAR) ? 0 : 1;
1054 1.1 cgd save = reginput;
1055 1.1 cgd no = regrepeat(OPERAND(scan));
1056 1.1 cgd while (no >= min) {
1057 1.1 cgd /* If it could work, try it. */
1058 1.1 cgd if (nextch == '\0' || *reginput == nextch)
1059 1.1 cgd if (regmatch(next))
1060 1.1 cgd return(1);
1061 1.1 cgd /* Couldn't or didn't -- back up. */
1062 1.1 cgd no--;
1063 1.1 cgd reginput = save + no;
1064 1.1 cgd }
1065 1.1 cgd return(0);
1066 1.1 cgd }
1067 1.1 cgd break;
1068 1.1 cgd case END:
1069 1.1 cgd return(1); /* Success! */
1070 1.1 cgd break;
1071 1.1 cgd default:
1072 1.1 cgd regerror("memory corruption");
1073 1.1 cgd return(0);
1074 1.1 cgd break;
1075 1.1 cgd }
1076 1.1 cgd
1077 1.1 cgd scan = next;
1078 1.1 cgd }
1079 1.1 cgd
1080 1.1 cgd /*
1081 1.1 cgd * We get here only if there's trouble -- normally "case END" is
1082 1.1 cgd * the terminating point.
1083 1.1 cgd */
1084 1.1 cgd regerror("corrupted pointers");
1085 1.1 cgd return(0);
1086 1.1 cgd }
1087 1.1 cgd
1088 1.1 cgd /*
1089 1.1 cgd - regrepeat - repeatedly match something simple, report how many
1090 1.1 cgd */
1091 1.1 cgd static int
1092 1.1 cgd regrepeat(p)
1093 1.1 cgd char *p;
1094 1.1 cgd {
1095 1.1 cgd register int count = 0;
1096 1.1 cgd register char *scan;
1097 1.1 cgd register char *opnd;
1098 1.1 cgd
1099 1.1 cgd scan = reginput;
1100 1.1 cgd opnd = OPERAND(p);
1101 1.1 cgd switch (OP(p)) {
1102 1.1 cgd case ANY:
1103 1.1 cgd count = strlen(scan);
1104 1.1 cgd scan += count;
1105 1.1 cgd break;
1106 1.1 cgd case EXACTLY:
1107 1.1 cgd while (*opnd == *scan) {
1108 1.1 cgd count++;
1109 1.1 cgd scan++;
1110 1.1 cgd }
1111 1.1 cgd break;
1112 1.1 cgd case ANYOF:
1113 1.1 cgd while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1114 1.1 cgd count++;
1115 1.1 cgd scan++;
1116 1.1 cgd }
1117 1.1 cgd break;
1118 1.1 cgd case ANYBUT:
1119 1.1 cgd while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1120 1.1 cgd count++;
1121 1.1 cgd scan++;
1122 1.1 cgd }
1123 1.1 cgd break;
1124 1.1 cgd default: /* Oh dear. Called inappropriately. */
1125 1.1 cgd regerror("internal foulup");
1126 1.1 cgd count = 0; /* Best compromise. */
1127 1.1 cgd break;
1128 1.1 cgd }
1129 1.1 cgd reginput = scan;
1130 1.1 cgd
1131 1.1 cgd return(count);
1132 1.1 cgd }
1133 1.1 cgd
1134 1.1 cgd /*
1135 1.1 cgd - regnext - dig the "next" pointer out of a node
1136 1.1 cgd */
1137 1.1 cgd static char *
1138 1.1 cgd regnext(p)
1139 1.1 cgd register char *p;
1140 1.1 cgd {
1141 1.1 cgd register int offset;
1142 1.1 cgd
1143 1.1 cgd if (p == ®dummy)
1144 1.1 cgd return(NULL);
1145 1.1 cgd
1146 1.1 cgd offset = NEXT(p);
1147 1.1 cgd if (offset == 0)
1148 1.1 cgd return(NULL);
1149 1.1 cgd
1150 1.1 cgd if (OP(p) == BACK)
1151 1.1 cgd return(p-offset);
1152 1.1 cgd else
1153 1.1 cgd return(p+offset);
1154 1.1 cgd }
1155 1.1 cgd
1156 1.1 cgd #ifdef DEBUG
1157 1.1 cgd
1158 1.1 cgd /*
1159 1.1 cgd - regdump - dump a regexp onto stdout in vaguely comprehensible form
1160 1.1 cgd */
1161 1.1 cgd void
1162 1.1 cgd regdump(r)
1163 1.1 cgd regexp *r;
1164 1.1 cgd {
1165 1.1 cgd register char *s;
1166 1.1 cgd register char op = EXACTLY; /* Arbitrary non-END op. */
1167 1.1 cgd register char *next;
1168 1.1 cgd extern char *strchr();
1169 1.1 cgd
1170 1.1 cgd
1171 1.1 cgd s = r->program + 1;
1172 1.1 cgd while (op != END) { /* While that wasn't END last time... */
1173 1.1 cgd op = OP(s);
1174 1.1 cgd printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1175 1.1 cgd next = regnext(s);
1176 1.1 cgd if (next == NULL) /* Next ptr. */
1177 1.1 cgd printf("(0)");
1178 1.1 cgd else
1179 1.1 cgd printf("(%d)", (s-r->program)+(next-s));
1180 1.1 cgd s += 3;
1181 1.1 cgd if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1182 1.1 cgd /* Literal string, where present. */
1183 1.1 cgd while (*s != '\0') {
1184 1.1 cgd putchar(*s);
1185 1.1 cgd s++;
1186 1.1 cgd }
1187 1.1 cgd s++;
1188 1.1 cgd }
1189 1.1 cgd putchar('\n');
1190 1.1 cgd }
1191 1.1 cgd
1192 1.1 cgd /* Header fields of interest. */
1193 1.1 cgd if (r->regstart != '\0')
1194 1.1 cgd printf("start `%c' ", r->regstart);
1195 1.1 cgd if (r->reganch)
1196 1.1 cgd printf("anchored ");
1197 1.1 cgd if (r->regmust != NULL)
1198 1.1 cgd printf("must have \"%s\"", r->regmust);
1199 1.1 cgd printf("\n");
1200 1.1 cgd }
1201 1.1 cgd
1202 1.1 cgd /*
1203 1.1 cgd - regprop - printable representation of opcode
1204 1.1 cgd */
1205 1.1 cgd static char *
1206 1.1 cgd regprop(op)
1207 1.1 cgd char *op;
1208 1.1 cgd {
1209 1.1 cgd register char *p;
1210 1.1 cgd static char buf[50];
1211 1.1 cgd
1212 1.6 mrg (void)strncpy(buf, ":", sizeof(buf) - 1);
1213 1.1 cgd
1214 1.1 cgd switch (OP(op)) {
1215 1.1 cgd case BOL:
1216 1.1 cgd p = "BOL";
1217 1.1 cgd break;
1218 1.1 cgd case EOL:
1219 1.1 cgd p = "EOL";
1220 1.1 cgd break;
1221 1.1 cgd case ANY:
1222 1.1 cgd p = "ANY";
1223 1.1 cgd break;
1224 1.1 cgd case ANYOF:
1225 1.1 cgd p = "ANYOF";
1226 1.1 cgd break;
1227 1.1 cgd case ANYBUT:
1228 1.1 cgd p = "ANYBUT";
1229 1.1 cgd break;
1230 1.1 cgd case BRANCH:
1231 1.1 cgd p = "BRANCH";
1232 1.1 cgd break;
1233 1.1 cgd case EXACTLY:
1234 1.1 cgd p = "EXACTLY";
1235 1.1 cgd break;
1236 1.1 cgd case NOTHING:
1237 1.1 cgd p = "NOTHING";
1238 1.1 cgd break;
1239 1.1 cgd case BACK:
1240 1.1 cgd p = "BACK";
1241 1.1 cgd break;
1242 1.1 cgd case END:
1243 1.1 cgd p = "END";
1244 1.1 cgd break;
1245 1.1 cgd case OPEN+1:
1246 1.1 cgd case OPEN+2:
1247 1.1 cgd case OPEN+3:
1248 1.1 cgd case OPEN+4:
1249 1.1 cgd case OPEN+5:
1250 1.1 cgd case OPEN+6:
1251 1.1 cgd case OPEN+7:
1252 1.1 cgd case OPEN+8:
1253 1.1 cgd case OPEN+9:
1254 1.6 mrg (void)snprintf(buf+strlen(buf), sizeof(buf) - strlen(buf),
1255 1.6 mrg "OPEN%d", OP(op)-OPEN);
1256 1.1 cgd p = NULL;
1257 1.1 cgd break;
1258 1.1 cgd case CLOSE+1:
1259 1.1 cgd case CLOSE+2:
1260 1.1 cgd case CLOSE+3:
1261 1.1 cgd case CLOSE+4:
1262 1.1 cgd case CLOSE+5:
1263 1.1 cgd case CLOSE+6:
1264 1.1 cgd case CLOSE+7:
1265 1.1 cgd case CLOSE+8:
1266 1.1 cgd case CLOSE+9:
1267 1.6 mrg (void)snprintf(buf+strlen(buf), sizeof(buf) - strlen(buf),
1268 1.6 mrg "CLOSE%d", OP(op)-CLOSE);
1269 1.1 cgd p = NULL;
1270 1.1 cgd break;
1271 1.1 cgd case STAR:
1272 1.1 cgd p = "STAR";
1273 1.1 cgd break;
1274 1.1 cgd case PLUS:
1275 1.1 cgd p = "PLUS";
1276 1.1 cgd break;
1277 1.1 cgd case WORDA:
1278 1.1 cgd p = "WORDA";
1279 1.1 cgd break;
1280 1.1 cgd case WORDZ:
1281 1.1 cgd p = "WORDZ";
1282 1.1 cgd break;
1283 1.1 cgd default:
1284 1.1 cgd regerror("corrupted opcode");
1285 1.1 cgd break;
1286 1.1 cgd }
1287 1.1 cgd if (p != NULL)
1288 1.6 mrg (void)strncat(buf, p, sizeof(buf) - strlen(buf) - 1);
1289 1.1 cgd return(buf);
1290 1.1 cgd }
1291 1.1 cgd #endif
1292 1.1 cgd
1293 1.1 cgd /*
1294 1.1 cgd * The following is provided for those people who do not have strcspn() in
1295 1.1 cgd * their C libraries. They should get off their butts and do something
1296 1.1 cgd * about it; at least one public-domain implementation of those (highly
1297 1.1 cgd * useful) string routines has been published on Usenet.
1298 1.1 cgd */
1299 1.1 cgd #ifdef STRCSPN
1300 1.1 cgd /*
1301 1.1 cgd * strcspn - find length of initial segment of s1 consisting entirely
1302 1.1 cgd * of characters not from s2
1303 1.1 cgd */
1304 1.1 cgd
1305 1.1 cgd static int
1306 1.1 cgd strcspn(s1, s2)
1307 1.1 cgd char *s1;
1308 1.1 cgd char *s2;
1309 1.1 cgd {
1310 1.1 cgd register char *scan1;
1311 1.1 cgd register char *scan2;
1312 1.1 cgd register int count;
1313 1.1 cgd
1314 1.1 cgd count = 0;
1315 1.1 cgd for (scan1 = s1; *scan1 != '\0'; scan1++) {
1316 1.1 cgd for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1317 1.1 cgd if (*scan1 == *scan2++)
1318 1.1 cgd return(count);
1319 1.1 cgd count++;
1320 1.1 cgd }
1321 1.1 cgd return(count);
1322 1.1 cgd }
1323 1.1 cgd #endif
1324