regexp.c revision 1.1 1 1.1 cgd /*
2 1.1 cgd * regcomp and regexec -- regsub and regerror are elsewhere
3 1.1 cgd *
4 1.1 cgd * Copyright (c) 1986 by University of Toronto.
5 1.1 cgd * Written by Henry Spencer. Not derived from licensed software.
6 1.1 cgd *
7 1.1 cgd * Permission is granted to anyone to use this software for any
8 1.1 cgd * purpose on any computer system, and to redistribute it freely,
9 1.1 cgd * subject to the following restrictions:
10 1.1 cgd *
11 1.1 cgd * 1. The author is not responsible for the consequences of use of
12 1.1 cgd * this software, no matter how awful, even if they arise
13 1.1 cgd * from defects in it.
14 1.1 cgd *
15 1.1 cgd * 2. The origin of this software must not be misrepresented, either
16 1.1 cgd * by explicit claim or by omission.
17 1.1 cgd *
18 1.1 cgd * 3. Altered versions must be plainly marked as such, and must not
19 1.1 cgd * be misrepresented as being the original software.
20 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 1.1 cgd *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 1.1 cgd *** to assist in implementing egrep.
23 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 1.1 cgd *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 1.1 cgd *** as in BSD grep and ex.
26 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 1.1 cgd *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 1.1 cgd *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 1.1 cgd *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
30 1.1 cgd *
31 1.1 cgd * Beware that some of this code is subtly aware of the way operator
32 1.1 cgd * precedence is structured in regular expressions. Serious changes in
33 1.1 cgd * regular-expression syntax might require a total rethink.
34 1.1 cgd */
35 1.1 cgd #include <regexp.h>
36 1.1 cgd #include <stdio.h>
37 1.1 cgd #include <ctype.h>
38 1.1 cgd #include <stdlib.h>
39 1.1 cgd #include <string.h>
40 1.1 cgd #include "regmagic.h"
41 1.1 cgd
42 1.1 cgd /*
43 1.1 cgd * The "internal use only" fields in regexp.h are present to pass info from
44 1.1 cgd * compile to execute that permits the execute phase to run lots faster on
45 1.1 cgd * simple cases. They are:
46 1.1 cgd *
47 1.1 cgd * regstart char that must begin a match; '\0' if none obvious
48 1.1 cgd * reganch is the match anchored (at beginning-of-line only)?
49 1.1 cgd * regmust string (pointer into program) that match must include, or NULL
50 1.1 cgd * regmlen length of regmust string
51 1.1 cgd *
52 1.1 cgd * Regstart and reganch permit very fast decisions on suitable starting points
53 1.1 cgd * for a match, cutting down the work a lot. Regmust permits fast rejection
54 1.1 cgd * of lines that cannot possibly match. The regmust tests are costly enough
55 1.1 cgd * that regcomp() supplies a regmust only if the r.e. contains something
56 1.1 cgd * potentially expensive (at present, the only such thing detected is * or +
57 1.1 cgd * at the start of the r.e., which can involve a lot of backup). Regmlen is
58 1.1 cgd * supplied because the test in regexec() needs it and regcomp() is computing
59 1.1 cgd * it anyway.
60 1.1 cgd */
61 1.1 cgd
62 1.1 cgd /*
63 1.1 cgd * Structure for regexp "program". This is essentially a linear encoding
64 1.1 cgd * of a nondeterministic finite-state machine (aka syntax charts or
65 1.1 cgd * "railroad normal form" in parsing technology). Each node is an opcode
66 1.1 cgd * plus a "next" pointer, possibly plus an operand. "Next" pointers of
67 1.1 cgd * all nodes except BRANCH implement concatenation; a "next" pointer with
68 1.1 cgd * a BRANCH on both ends of it is connecting two alternatives. (Here we
69 1.1 cgd * have one of the subtle syntax dependencies: an individual BRANCH (as
70 1.1 cgd * opposed to a collection of them) is never concatenated with anything
71 1.1 cgd * because of operator precedence.) The operand of some types of node is
72 1.1 cgd * a literal string; for others, it is a node leading into a sub-FSM. In
73 1.1 cgd * particular, the operand of a BRANCH node is the first node of the branch.
74 1.1 cgd * (NB this is *not* a tree structure: the tail of the branch connects
75 1.1 cgd * to the thing following the set of BRANCHes.) The opcodes are:
76 1.1 cgd */
77 1.1 cgd
78 1.1 cgd /* definition number opnd? meaning */
79 1.1 cgd #define END 0 /* no End of program. */
80 1.1 cgd #define BOL 1 /* no Match "" at beginning of line. */
81 1.1 cgd #define EOL 2 /* no Match "" at end of line. */
82 1.1 cgd #define ANY 3 /* no Match any one character. */
83 1.1 cgd #define ANYOF 4 /* str Match any character in this string. */
84 1.1 cgd #define ANYBUT 5 /* str Match any character not in this string. */
85 1.1 cgd #define BRANCH 6 /* node Match this alternative, or the next... */
86 1.1 cgd #define BACK 7 /* no Match "", "next" ptr points backward. */
87 1.1 cgd #define EXACTLY 8 /* str Match this string. */
88 1.1 cgd #define NOTHING 9 /* no Match empty string. */
89 1.1 cgd #define STAR 10 /* node Match this (simple) thing 0 or more times. */
90 1.1 cgd #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
91 1.1 cgd #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */
92 1.1 cgd #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */
93 1.1 cgd #define OPEN 20 /* no Mark this point in input as start of #n. */
94 1.1 cgd /* OPEN+1 is number 1, etc. */
95 1.1 cgd #define CLOSE 30 /* no Analogous to OPEN. */
96 1.1 cgd
97 1.1 cgd /*
98 1.1 cgd * Opcode notes:
99 1.1 cgd *
100 1.1 cgd * BRANCH The set of branches constituting a single choice are hooked
101 1.1 cgd * together with their "next" pointers, since precedence prevents
102 1.1 cgd * anything being concatenated to any individual branch. The
103 1.1 cgd * "next" pointer of the last BRANCH in a choice points to the
104 1.1 cgd * thing following the whole choice. This is also where the
105 1.1 cgd * final "next" pointer of each individual branch points; each
106 1.1 cgd * branch starts with the operand node of a BRANCH node.
107 1.1 cgd *
108 1.1 cgd * BACK Normal "next" pointers all implicitly point forward; BACK
109 1.1 cgd * exists to make loop structures possible.
110 1.1 cgd *
111 1.1 cgd * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
112 1.1 cgd * BRANCH structures using BACK. Simple cases (one character
113 1.1 cgd * per match) are implemented with STAR and PLUS for speed
114 1.1 cgd * and to minimize recursive plunges.
115 1.1 cgd *
116 1.1 cgd * OPEN,CLOSE ...are numbered at compile time.
117 1.1 cgd */
118 1.1 cgd
119 1.1 cgd /*
120 1.1 cgd * A node is one char of opcode followed by two chars of "next" pointer.
121 1.1 cgd * "Next" pointers are stored as two 8-bit pieces, high order first. The
122 1.1 cgd * value is a positive offset from the opcode of the node containing it.
123 1.1 cgd * An operand, if any, simply follows the node. (Note that much of the
124 1.1 cgd * code generation knows about this implicit relationship.)
125 1.1 cgd *
126 1.1 cgd * Using two bytes for the "next" pointer is vast overkill for most things,
127 1.1 cgd * but allows patterns to get big without disasters.
128 1.1 cgd */
129 1.1 cgd #define OP(p) (*(p))
130 1.1 cgd #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
131 1.1 cgd #define OPERAND(p) ((p) + 3)
132 1.1 cgd
133 1.1 cgd /*
134 1.1 cgd * See regmagic.h for one further detail of program structure.
135 1.1 cgd */
136 1.1 cgd
137 1.1 cgd
138 1.1 cgd /*
139 1.1 cgd * Utility definitions.
140 1.1 cgd */
141 1.1 cgd #ifndef CHARBITS
142 1.1 cgd #define UCHARAT(p) ((int)*(unsigned char *)(p))
143 1.1 cgd #else
144 1.1 cgd #define UCHARAT(p) ((int)*(p)&CHARBITS)
145 1.1 cgd #endif
146 1.1 cgd
147 1.1 cgd #define FAIL(m) { regerror(m); return(NULL); }
148 1.1 cgd #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
149 1.1 cgd
150 1.1 cgd /*
151 1.1 cgd * Flags to be passed up and down.
152 1.1 cgd */
153 1.1 cgd #define HASWIDTH 01 /* Known never to match null string. */
154 1.1 cgd #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
155 1.1 cgd #define SPSTART 04 /* Starts with * or +. */
156 1.1 cgd #define WORST 0 /* Worst case. */
157 1.1 cgd
158 1.1 cgd /*
159 1.1 cgd * Global work variables for regcomp().
160 1.1 cgd */
161 1.1 cgd static char *regparse; /* Input-scan pointer. */
162 1.1 cgd static int regnpar; /* () count. */
163 1.1 cgd static char regdummy;
164 1.1 cgd static char *regcode; /* Code-emit pointer; ®dummy = don't. */
165 1.1 cgd static long regsize; /* Code size. */
166 1.1 cgd
167 1.1 cgd /*
168 1.1 cgd * Forward declarations for regcomp()'s friends.
169 1.1 cgd */
170 1.1 cgd #ifndef STATIC
171 1.1 cgd #define STATIC static
172 1.1 cgd #endif
173 1.1 cgd STATIC char *reg();
174 1.1 cgd STATIC char *regbranch();
175 1.1 cgd STATIC char *regpiece();
176 1.1 cgd STATIC char *regatom();
177 1.1 cgd STATIC char *regnode();
178 1.1 cgd STATIC char *regnext();
179 1.1 cgd STATIC void regc();
180 1.1 cgd STATIC void reginsert();
181 1.1 cgd STATIC void regtail();
182 1.1 cgd STATIC void regoptail();
183 1.1 cgd #ifdef STRCSPN
184 1.1 cgd STATIC int strcspn();
185 1.1 cgd #endif
186 1.1 cgd
187 1.1 cgd /*
188 1.1 cgd - regcomp - compile a regular expression into internal code
189 1.1 cgd *
190 1.1 cgd * We can't allocate space until we know how big the compiled form will be,
191 1.1 cgd * but we can't compile it (and thus know how big it is) until we've got a
192 1.1 cgd * place to put the code. So we cheat: we compile it twice, once with code
193 1.1 cgd * generation turned off and size counting turned on, and once "for real".
194 1.1 cgd * This also means that we don't allocate space until we are sure that the
195 1.1 cgd * thing really will compile successfully, and we never have to move the
196 1.1 cgd * code and thus invalidate pointers into it. (Note that it has to be in
197 1.1 cgd * one piece because free() must be able to free it all.)
198 1.1 cgd *
199 1.1 cgd * Beware that the optimization-preparation code in here knows about some
200 1.1 cgd * of the structure of the compiled regexp.
201 1.1 cgd */
202 1.1 cgd regexp *
203 1.1 cgd regcomp(exp)
204 1.1 cgd const char *exp;
205 1.1 cgd {
206 1.1 cgd register regexp *r;
207 1.1 cgd register char *scan;
208 1.1 cgd register char *longest;
209 1.1 cgd register int len;
210 1.1 cgd int flags;
211 1.1 cgd
212 1.1 cgd if (exp == NULL)
213 1.1 cgd FAIL("NULL argument");
214 1.1 cgd
215 1.1 cgd /* First pass: determine size, legality. */
216 1.1 cgd #ifdef notdef
217 1.1 cgd if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */
218 1.1 cgd #endif
219 1.1 cgd regparse = (char *)exp;
220 1.1 cgd regnpar = 1;
221 1.1 cgd regsize = 0L;
222 1.1 cgd regcode = ®dummy;
223 1.1 cgd regc(MAGIC);
224 1.1 cgd if (reg(0, &flags) == NULL)
225 1.1 cgd return(NULL);
226 1.1 cgd
227 1.1 cgd /* Small enough for pointer-storage convention? */
228 1.1 cgd if (regsize >= 32767L) /* Probably could be 65535L. */
229 1.1 cgd FAIL("regexp too big");
230 1.1 cgd
231 1.1 cgd /* Allocate space. */
232 1.1 cgd r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
233 1.1 cgd if (r == NULL)
234 1.1 cgd FAIL("out of space");
235 1.1 cgd
236 1.1 cgd /* Second pass: emit code. */
237 1.1 cgd regparse = (char *)exp;
238 1.1 cgd regnpar = 1;
239 1.1 cgd regcode = r->program;
240 1.1 cgd regc(MAGIC);
241 1.1 cgd if (reg(0, &flags) == NULL)
242 1.1 cgd return(NULL);
243 1.1 cgd
244 1.1 cgd /* Dig out information for optimizations. */
245 1.1 cgd r->regstart = '\0'; /* Worst-case defaults. */
246 1.1 cgd r->reganch = 0;
247 1.1 cgd r->regmust = NULL;
248 1.1 cgd r->regmlen = 0;
249 1.1 cgd scan = r->program+1; /* First BRANCH. */
250 1.1 cgd if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
251 1.1 cgd scan = OPERAND(scan);
252 1.1 cgd
253 1.1 cgd /* Starting-point info. */
254 1.1 cgd if (OP(scan) == EXACTLY)
255 1.1 cgd r->regstart = *OPERAND(scan);
256 1.1 cgd else if (OP(scan) == BOL)
257 1.1 cgd r->reganch++;
258 1.1 cgd
259 1.1 cgd /*
260 1.1 cgd * If there's something expensive in the r.e., find the
261 1.1 cgd * longest literal string that must appear and make it the
262 1.1 cgd * regmust. Resolve ties in favor of later strings, since
263 1.1 cgd * the regstart check works with the beginning of the r.e.
264 1.1 cgd * and avoiding duplication strengthens checking. Not a
265 1.1 cgd * strong reason, but sufficient in the absence of others.
266 1.1 cgd */
267 1.1 cgd if (flags&SPSTART) {
268 1.1 cgd longest = NULL;
269 1.1 cgd len = 0;
270 1.1 cgd for (; scan != NULL; scan = regnext(scan))
271 1.1 cgd if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
272 1.1 cgd longest = OPERAND(scan);
273 1.1 cgd len = strlen(OPERAND(scan));
274 1.1 cgd }
275 1.1 cgd r->regmust = longest;
276 1.1 cgd r->regmlen = len;
277 1.1 cgd }
278 1.1 cgd }
279 1.1 cgd
280 1.1 cgd return(r);
281 1.1 cgd }
282 1.1 cgd
283 1.1 cgd /*
284 1.1 cgd - reg - regular expression, i.e. main body or parenthesized thing
285 1.1 cgd *
286 1.1 cgd * Caller must absorb opening parenthesis.
287 1.1 cgd *
288 1.1 cgd * Combining parenthesis handling with the base level of regular expression
289 1.1 cgd * is a trifle forced, but the need to tie the tails of the branches to what
290 1.1 cgd * follows makes it hard to avoid.
291 1.1 cgd */
292 1.1 cgd static char *
293 1.1 cgd reg(paren, flagp)
294 1.1 cgd int paren; /* Parenthesized? */
295 1.1 cgd int *flagp;
296 1.1 cgd {
297 1.1 cgd register char *ret;
298 1.1 cgd register char *br;
299 1.1 cgd register char *ender;
300 1.1 cgd register int parno;
301 1.1 cgd int flags;
302 1.1 cgd
303 1.1 cgd *flagp = HASWIDTH; /* Tentatively. */
304 1.1 cgd
305 1.1 cgd /* Make an OPEN node, if parenthesized. */
306 1.1 cgd if (paren) {
307 1.1 cgd if (regnpar >= NSUBEXP)
308 1.1 cgd FAIL("too many ()");
309 1.1 cgd parno = regnpar;
310 1.1 cgd regnpar++;
311 1.1 cgd ret = regnode(OPEN+parno);
312 1.1 cgd } else
313 1.1 cgd ret = NULL;
314 1.1 cgd
315 1.1 cgd /* Pick up the branches, linking them together. */
316 1.1 cgd br = regbranch(&flags);
317 1.1 cgd if (br == NULL)
318 1.1 cgd return(NULL);
319 1.1 cgd if (ret != NULL)
320 1.1 cgd regtail(ret, br); /* OPEN -> first. */
321 1.1 cgd else
322 1.1 cgd ret = br;
323 1.1 cgd if (!(flags&HASWIDTH))
324 1.1 cgd *flagp &= ~HASWIDTH;
325 1.1 cgd *flagp |= flags&SPSTART;
326 1.1 cgd while (*regparse == '|' || *regparse == '\n') {
327 1.1 cgd regparse++;
328 1.1 cgd br = regbranch(&flags);
329 1.1 cgd if (br == NULL)
330 1.1 cgd return(NULL);
331 1.1 cgd regtail(ret, br); /* BRANCH -> BRANCH. */
332 1.1 cgd if (!(flags&HASWIDTH))
333 1.1 cgd *flagp &= ~HASWIDTH;
334 1.1 cgd *flagp |= flags&SPSTART;
335 1.1 cgd }
336 1.1 cgd
337 1.1 cgd /* Make a closing node, and hook it on the end. */
338 1.1 cgd ender = regnode((paren) ? CLOSE+parno : END);
339 1.1 cgd regtail(ret, ender);
340 1.1 cgd
341 1.1 cgd /* Hook the tails of the branches to the closing node. */
342 1.1 cgd for (br = ret; br != NULL; br = regnext(br))
343 1.1 cgd regoptail(br, ender);
344 1.1 cgd
345 1.1 cgd /* Check for proper termination. */
346 1.1 cgd if (paren && *regparse++ != ')') {
347 1.1 cgd FAIL("unmatched ()");
348 1.1 cgd } else if (!paren && *regparse != '\0') {
349 1.1 cgd if (*regparse == ')') {
350 1.1 cgd FAIL("unmatched ()");
351 1.1 cgd } else
352 1.1 cgd FAIL("junk on end"); /* "Can't happen". */
353 1.1 cgd /* NOTREACHED */
354 1.1 cgd }
355 1.1 cgd
356 1.1 cgd return(ret);
357 1.1 cgd }
358 1.1 cgd
359 1.1 cgd /*
360 1.1 cgd - regbranch - one alternative of an | operator
361 1.1 cgd *
362 1.1 cgd * Implements the concatenation operator.
363 1.1 cgd */
364 1.1 cgd static char *
365 1.1 cgd regbranch(flagp)
366 1.1 cgd int *flagp;
367 1.1 cgd {
368 1.1 cgd register char *ret;
369 1.1 cgd register char *chain;
370 1.1 cgd register char *latest;
371 1.1 cgd int flags;
372 1.1 cgd
373 1.1 cgd *flagp = WORST; /* Tentatively. */
374 1.1 cgd
375 1.1 cgd ret = regnode(BRANCH);
376 1.1 cgd chain = NULL;
377 1.1 cgd while (*regparse != '\0' && *regparse != ')' &&
378 1.1 cgd *regparse != '\n' && *regparse != '|') {
379 1.1 cgd latest = regpiece(&flags);
380 1.1 cgd if (latest == NULL)
381 1.1 cgd return(NULL);
382 1.1 cgd *flagp |= flags&HASWIDTH;
383 1.1 cgd if (chain == NULL) /* First piece. */
384 1.1 cgd *flagp |= flags&SPSTART;
385 1.1 cgd else
386 1.1 cgd regtail(chain, latest);
387 1.1 cgd chain = latest;
388 1.1 cgd }
389 1.1 cgd if (chain == NULL) /* Loop ran zero times. */
390 1.1 cgd (void) regnode(NOTHING);
391 1.1 cgd
392 1.1 cgd return(ret);
393 1.1 cgd }
394 1.1 cgd
395 1.1 cgd /*
396 1.1 cgd - regpiece - something followed by possible [*+?]
397 1.1 cgd *
398 1.1 cgd * Note that the branching code sequences used for ? and the general cases
399 1.1 cgd * of * and + are somewhat optimized: they use the same NOTHING node as
400 1.1 cgd * both the endmarker for their branch list and the body of the last branch.
401 1.1 cgd * It might seem that this node could be dispensed with entirely, but the
402 1.1 cgd * endmarker role is not redundant.
403 1.1 cgd */
404 1.1 cgd static char *
405 1.1 cgd regpiece(flagp)
406 1.1 cgd int *flagp;
407 1.1 cgd {
408 1.1 cgd register char *ret;
409 1.1 cgd register char op;
410 1.1 cgd register char *next;
411 1.1 cgd int flags;
412 1.1 cgd
413 1.1 cgd ret = regatom(&flags);
414 1.1 cgd if (ret == NULL)
415 1.1 cgd return(NULL);
416 1.1 cgd
417 1.1 cgd op = *regparse;
418 1.1 cgd if (!ISMULT(op)) {
419 1.1 cgd *flagp = flags;
420 1.1 cgd return(ret);
421 1.1 cgd }
422 1.1 cgd
423 1.1 cgd if (!(flags&HASWIDTH) && op != '?')
424 1.1 cgd FAIL("*+ operand could be empty");
425 1.1 cgd *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
426 1.1 cgd
427 1.1 cgd if (op == '*' && (flags&SIMPLE))
428 1.1 cgd reginsert(STAR, ret);
429 1.1 cgd else if (op == '*') {
430 1.1 cgd /* Emit x* as (x&|), where & means "self". */
431 1.1 cgd reginsert(BRANCH, ret); /* Either x */
432 1.1 cgd regoptail(ret, regnode(BACK)); /* and loop */
433 1.1 cgd regoptail(ret, ret); /* back */
434 1.1 cgd regtail(ret, regnode(BRANCH)); /* or */
435 1.1 cgd regtail(ret, regnode(NOTHING)); /* null. */
436 1.1 cgd } else if (op == '+' && (flags&SIMPLE))
437 1.1 cgd reginsert(PLUS, ret);
438 1.1 cgd else if (op == '+') {
439 1.1 cgd /* Emit x+ as x(&|), where & means "self". */
440 1.1 cgd next = regnode(BRANCH); /* Either */
441 1.1 cgd regtail(ret, next);
442 1.1 cgd regtail(regnode(BACK), ret); /* loop back */
443 1.1 cgd regtail(next, regnode(BRANCH)); /* or */
444 1.1 cgd regtail(ret, regnode(NOTHING)); /* null. */
445 1.1 cgd } else if (op == '?') {
446 1.1 cgd /* Emit x? as (x|) */
447 1.1 cgd reginsert(BRANCH, ret); /* Either x */
448 1.1 cgd regtail(ret, regnode(BRANCH)); /* or */
449 1.1 cgd next = regnode(NOTHING); /* null. */
450 1.1 cgd regtail(ret, next);
451 1.1 cgd regoptail(ret, next);
452 1.1 cgd }
453 1.1 cgd regparse++;
454 1.1 cgd if (ISMULT(*regparse))
455 1.1 cgd FAIL("nested *?+");
456 1.1 cgd
457 1.1 cgd return(ret);
458 1.1 cgd }
459 1.1 cgd
460 1.1 cgd /*
461 1.1 cgd - regatom - the lowest level
462 1.1 cgd *
463 1.1 cgd * Optimization: gobbles an entire sequence of ordinary characters so that
464 1.1 cgd * it can turn them into a single node, which is smaller to store and
465 1.1 cgd * faster to run. Backslashed characters are exceptions, each becoming a
466 1.1 cgd * separate node; the code is simpler that way and it's not worth fixing.
467 1.1 cgd */
468 1.1 cgd static char *
469 1.1 cgd regatom(flagp)
470 1.1 cgd int *flagp;
471 1.1 cgd {
472 1.1 cgd register char *ret;
473 1.1 cgd int flags;
474 1.1 cgd
475 1.1 cgd *flagp = WORST; /* Tentatively. */
476 1.1 cgd
477 1.1 cgd switch (*regparse++) {
478 1.1 cgd /* FIXME: these chars only have meaning at beg/end of pat? */
479 1.1 cgd case '^':
480 1.1 cgd ret = regnode(BOL);
481 1.1 cgd break;
482 1.1 cgd case '$':
483 1.1 cgd ret = regnode(EOL);
484 1.1 cgd break;
485 1.1 cgd case '.':
486 1.1 cgd ret = regnode(ANY);
487 1.1 cgd *flagp |= HASWIDTH|SIMPLE;
488 1.1 cgd break;
489 1.1 cgd case '[': {
490 1.1 cgd register int class;
491 1.1 cgd register int classend;
492 1.1 cgd
493 1.1 cgd if (*regparse == '^') { /* Complement of range. */
494 1.1 cgd ret = regnode(ANYBUT);
495 1.1 cgd regparse++;
496 1.1 cgd } else
497 1.1 cgd ret = regnode(ANYOF);
498 1.1 cgd if (*regparse == ']' || *regparse == '-')
499 1.1 cgd regc(*regparse++);
500 1.1 cgd while (*regparse != '\0' && *regparse != ']') {
501 1.1 cgd if (*regparse == '-') {
502 1.1 cgd regparse++;
503 1.1 cgd if (*regparse == ']' || *regparse == '\0')
504 1.1 cgd regc('-');
505 1.1 cgd else {
506 1.1 cgd class = UCHARAT(regparse-2)+1;
507 1.1 cgd classend = UCHARAT(regparse);
508 1.1 cgd if (class > classend+1)
509 1.1 cgd FAIL("invalid [] range");
510 1.1 cgd for (; class <= classend; class++)
511 1.1 cgd regc(class);
512 1.1 cgd regparse++;
513 1.1 cgd }
514 1.1 cgd } else
515 1.1 cgd regc(*regparse++);
516 1.1 cgd }
517 1.1 cgd regc('\0');
518 1.1 cgd if (*regparse != ']')
519 1.1 cgd FAIL("unmatched []");
520 1.1 cgd regparse++;
521 1.1 cgd *flagp |= HASWIDTH|SIMPLE;
522 1.1 cgd }
523 1.1 cgd break;
524 1.1 cgd case '(':
525 1.1 cgd ret = reg(1, &flags);
526 1.1 cgd if (ret == NULL)
527 1.1 cgd return(NULL);
528 1.1 cgd *flagp |= flags&(HASWIDTH|SPSTART);
529 1.1 cgd break;
530 1.1 cgd case '\0':
531 1.1 cgd case '|':
532 1.1 cgd case '\n':
533 1.1 cgd case ')':
534 1.1 cgd FAIL("internal urp"); /* Supposed to be caught earlier. */
535 1.1 cgd break;
536 1.1 cgd case '?':
537 1.1 cgd case '+':
538 1.1 cgd case '*':
539 1.1 cgd FAIL("?+* follows nothing");
540 1.1 cgd break;
541 1.1 cgd case '\\':
542 1.1 cgd switch (*regparse++) {
543 1.1 cgd case '\0':
544 1.1 cgd FAIL("trailing \\");
545 1.1 cgd break;
546 1.1 cgd case '<':
547 1.1 cgd ret = regnode(WORDA);
548 1.1 cgd break;
549 1.1 cgd case '>':
550 1.1 cgd ret = regnode(WORDZ);
551 1.1 cgd break;
552 1.1 cgd /* FIXME: Someday handle \1, \2, ... */
553 1.1 cgd default:
554 1.1 cgd /* Handle general quoted chars in exact-match routine */
555 1.1 cgd goto de_fault;
556 1.1 cgd }
557 1.1 cgd break;
558 1.1 cgd de_fault:
559 1.1 cgd default:
560 1.1 cgd /*
561 1.1 cgd * Encode a string of characters to be matched exactly.
562 1.1 cgd *
563 1.1 cgd * This is a bit tricky due to quoted chars and due to
564 1.1 cgd * '*', '+', and '?' taking the SINGLE char previous
565 1.1 cgd * as their operand.
566 1.1 cgd *
567 1.1 cgd * On entry, the char at regparse[-1] is going to go
568 1.1 cgd * into the string, no matter what it is. (It could be
569 1.1 cgd * following a \ if we are entered from the '\' case.)
570 1.1 cgd *
571 1.1 cgd * Basic idea is to pick up a good char in ch and
572 1.1 cgd * examine the next char. If it's *+? then we twiddle.
573 1.1 cgd * If it's \ then we frozzle. If it's other magic char
574 1.1 cgd * we push ch and terminate the string. If none of the
575 1.1 cgd * above, we push ch on the string and go around again.
576 1.1 cgd *
577 1.1 cgd * regprev is used to remember where "the current char"
578 1.1 cgd * starts in the string, if due to a *+? we need to back
579 1.1 cgd * up and put the current char in a separate, 1-char, string.
580 1.1 cgd * When regprev is NULL, ch is the only char in the
581 1.1 cgd * string; this is used in *+? handling, and in setting
582 1.1 cgd * flags |= SIMPLE at the end.
583 1.1 cgd */
584 1.1 cgd {
585 1.1 cgd char *regprev;
586 1.1 cgd register char ch;
587 1.1 cgd
588 1.1 cgd regparse--; /* Look at cur char */
589 1.1 cgd ret = regnode(EXACTLY);
590 1.1 cgd for ( regprev = 0 ; ; ) {
591 1.1 cgd ch = *regparse++; /* Get current char */
592 1.1 cgd switch (*regparse) { /* look at next one */
593 1.1 cgd
594 1.1 cgd default:
595 1.1 cgd regc(ch); /* Add cur to string */
596 1.1 cgd break;
597 1.1 cgd
598 1.1 cgd case '.': case '[': case '(':
599 1.1 cgd case ')': case '|': case '\n':
600 1.1 cgd case '$': case '^':
601 1.1 cgd case '\0':
602 1.1 cgd /* FIXME, $ and ^ should not always be magic */
603 1.1 cgd magic:
604 1.1 cgd regc(ch); /* dump cur char */
605 1.1 cgd goto done; /* and we are done */
606 1.1 cgd
607 1.1 cgd case '?': case '+': case '*':
608 1.1 cgd if (!regprev) /* If just ch in str, */
609 1.1 cgd goto magic; /* use it */
610 1.1 cgd /* End mult-char string one early */
611 1.1 cgd regparse = regprev; /* Back up parse */
612 1.1 cgd goto done;
613 1.1 cgd
614 1.1 cgd case '\\':
615 1.1 cgd regc(ch); /* Cur char OK */
616 1.1 cgd switch (regparse[1]){ /* Look after \ */
617 1.1 cgd case '\0':
618 1.1 cgd case '<':
619 1.1 cgd case '>':
620 1.1 cgd /* FIXME: Someday handle \1, \2, ... */
621 1.1 cgd goto done; /* Not quoted */
622 1.1 cgd default:
623 1.1 cgd /* Backup point is \, scan * point is after it. */
624 1.1 cgd regprev = regparse;
625 1.1 cgd regparse++;
626 1.1 cgd continue; /* NOT break; */
627 1.1 cgd }
628 1.1 cgd }
629 1.1 cgd regprev = regparse; /* Set backup point */
630 1.1 cgd }
631 1.1 cgd done:
632 1.1 cgd regc('\0');
633 1.1 cgd *flagp |= HASWIDTH;
634 1.1 cgd if (!regprev) /* One char? */
635 1.1 cgd *flagp |= SIMPLE;
636 1.1 cgd }
637 1.1 cgd break;
638 1.1 cgd }
639 1.1 cgd
640 1.1 cgd return(ret);
641 1.1 cgd }
642 1.1 cgd
643 1.1 cgd /*
644 1.1 cgd - regnode - emit a node
645 1.1 cgd */
646 1.1 cgd static char * /* Location. */
647 1.1 cgd regnode(op)
648 1.1 cgd char op;
649 1.1 cgd {
650 1.1 cgd register char *ret;
651 1.1 cgd register char *ptr;
652 1.1 cgd
653 1.1 cgd ret = regcode;
654 1.1 cgd if (ret == ®dummy) {
655 1.1 cgd regsize += 3;
656 1.1 cgd return(ret);
657 1.1 cgd }
658 1.1 cgd
659 1.1 cgd ptr = ret;
660 1.1 cgd *ptr++ = op;
661 1.1 cgd *ptr++ = '\0'; /* Null "next" pointer. */
662 1.1 cgd *ptr++ = '\0';
663 1.1 cgd regcode = ptr;
664 1.1 cgd
665 1.1 cgd return(ret);
666 1.1 cgd }
667 1.1 cgd
668 1.1 cgd /*
669 1.1 cgd - regc - emit (if appropriate) a byte of code
670 1.1 cgd */
671 1.1 cgd static void
672 1.1 cgd regc(b)
673 1.1 cgd char b;
674 1.1 cgd {
675 1.1 cgd if (regcode != ®dummy)
676 1.1 cgd *regcode++ = b;
677 1.1 cgd else
678 1.1 cgd regsize++;
679 1.1 cgd }
680 1.1 cgd
681 1.1 cgd /*
682 1.1 cgd - reginsert - insert an operator in front of already-emitted operand
683 1.1 cgd *
684 1.1 cgd * Means relocating the operand.
685 1.1 cgd */
686 1.1 cgd static void
687 1.1 cgd reginsert(op, opnd)
688 1.1 cgd char op;
689 1.1 cgd char *opnd;
690 1.1 cgd {
691 1.1 cgd register char *src;
692 1.1 cgd register char *dst;
693 1.1 cgd register char *place;
694 1.1 cgd
695 1.1 cgd if (regcode == ®dummy) {
696 1.1 cgd regsize += 3;
697 1.1 cgd return;
698 1.1 cgd }
699 1.1 cgd
700 1.1 cgd src = regcode;
701 1.1 cgd regcode += 3;
702 1.1 cgd dst = regcode;
703 1.1 cgd while (src > opnd)
704 1.1 cgd *--dst = *--src;
705 1.1 cgd
706 1.1 cgd place = opnd; /* Op node, where operand used to be. */
707 1.1 cgd *place++ = op;
708 1.1 cgd *place++ = '\0';
709 1.1 cgd *place++ = '\0';
710 1.1 cgd }
711 1.1 cgd
712 1.1 cgd /*
713 1.1 cgd - regtail - set the next-pointer at the end of a node chain
714 1.1 cgd */
715 1.1 cgd static void
716 1.1 cgd regtail(p, val)
717 1.1 cgd char *p;
718 1.1 cgd char *val;
719 1.1 cgd {
720 1.1 cgd register char *scan;
721 1.1 cgd register char *temp;
722 1.1 cgd register int offset;
723 1.1 cgd
724 1.1 cgd if (p == ®dummy)
725 1.1 cgd return;
726 1.1 cgd
727 1.1 cgd /* Find last node. */
728 1.1 cgd scan = p;
729 1.1 cgd for (;;) {
730 1.1 cgd temp = regnext(scan);
731 1.1 cgd if (temp == NULL)
732 1.1 cgd break;
733 1.1 cgd scan = temp;
734 1.1 cgd }
735 1.1 cgd
736 1.1 cgd if (OP(scan) == BACK)
737 1.1 cgd offset = scan - val;
738 1.1 cgd else
739 1.1 cgd offset = val - scan;
740 1.1 cgd *(scan+1) = (offset>>8)&0377;
741 1.1 cgd *(scan+2) = offset&0377;
742 1.1 cgd }
743 1.1 cgd
744 1.1 cgd /*
745 1.1 cgd - regoptail - regtail on operand of first argument; nop if operandless
746 1.1 cgd */
747 1.1 cgd static void
748 1.1 cgd regoptail(p, val)
749 1.1 cgd char *p;
750 1.1 cgd char *val;
751 1.1 cgd {
752 1.1 cgd /* "Operandless" and "op != BRANCH" are synonymous in practice. */
753 1.1 cgd if (p == NULL || p == ®dummy || OP(p) != BRANCH)
754 1.1 cgd return;
755 1.1 cgd regtail(OPERAND(p), val);
756 1.1 cgd }
757 1.1 cgd
758 1.1 cgd /*
759 1.1 cgd * regexec and friends
760 1.1 cgd */
761 1.1 cgd
762 1.1 cgd /*
763 1.1 cgd * Global work variables for regexec().
764 1.1 cgd */
765 1.1 cgd static char *reginput; /* String-input pointer. */
766 1.1 cgd static char *regbol; /* Beginning of input, for ^ check. */
767 1.1 cgd static char **regstartp; /* Pointer to startp array. */
768 1.1 cgd static char **regendp; /* Ditto for endp. */
769 1.1 cgd
770 1.1 cgd /*
771 1.1 cgd * Forwards.
772 1.1 cgd */
773 1.1 cgd STATIC int regtry();
774 1.1 cgd STATIC int regmatch();
775 1.1 cgd STATIC int regrepeat();
776 1.1 cgd
777 1.1 cgd #ifdef DEBUG
778 1.1 cgd int regnarrate = 0;
779 1.1 cgd void regdump();
780 1.1 cgd STATIC char *regprop();
781 1.1 cgd #endif
782 1.1 cgd
783 1.1 cgd /*
784 1.1 cgd - regexec - match a regexp against a string
785 1.1 cgd */
786 1.1 cgd int
787 1.1 cgd regexec(prog, string)
788 1.1 cgd register const regexp *prog;
789 1.1 cgd register const char *string;
790 1.1 cgd {
791 1.1 cgd register char *s;
792 1.1 cgd extern char *strchr();
793 1.1 cgd
794 1.1 cgd /* Be paranoid... */
795 1.1 cgd if (prog == NULL || string == NULL) {
796 1.1 cgd regerror("NULL parameter");
797 1.1 cgd return(0);
798 1.1 cgd }
799 1.1 cgd
800 1.1 cgd /* Check validity of program. */
801 1.1 cgd if (UCHARAT(prog->program) != MAGIC) {
802 1.1 cgd regerror("corrupted program");
803 1.1 cgd return(0);
804 1.1 cgd }
805 1.1 cgd
806 1.1 cgd /* If there is a "must appear" string, look for it. */
807 1.1 cgd if (prog->regmust != NULL) {
808 1.1 cgd s = (char *)string;
809 1.1 cgd while ((s = strchr(s, prog->regmust[0])) != NULL) {
810 1.1 cgd if (strncmp(s, prog->regmust, prog->regmlen) == 0)
811 1.1 cgd break; /* Found it. */
812 1.1 cgd s++;
813 1.1 cgd }
814 1.1 cgd if (s == NULL) /* Not present. */
815 1.1 cgd return(0);
816 1.1 cgd }
817 1.1 cgd
818 1.1 cgd /* Mark beginning of line for ^ . */
819 1.1 cgd regbol = (char *)string;
820 1.1 cgd
821 1.1 cgd /* Simplest case: anchored match need be tried only once. */
822 1.1 cgd if (prog->reganch)
823 1.1 cgd return(regtry(prog, string));
824 1.1 cgd
825 1.1 cgd /* Messy cases: unanchored match. */
826 1.1 cgd s = (char *)string;
827 1.1 cgd if (prog->regstart != '\0')
828 1.1 cgd /* We know what char it must start with. */
829 1.1 cgd while ((s = strchr(s, prog->regstart)) != NULL) {
830 1.1 cgd if (regtry(prog, s))
831 1.1 cgd return(1);
832 1.1 cgd s++;
833 1.1 cgd }
834 1.1 cgd else
835 1.1 cgd /* We don't -- general case. */
836 1.1 cgd do {
837 1.1 cgd if (regtry(prog, s))
838 1.1 cgd return(1);
839 1.1 cgd } while (*s++ != '\0');
840 1.1 cgd
841 1.1 cgd /* Failure. */
842 1.1 cgd return(0);
843 1.1 cgd }
844 1.1 cgd
845 1.1 cgd /*
846 1.1 cgd - regtry - try match at specific point
847 1.1 cgd */
848 1.1 cgd static int /* 0 failure, 1 success */
849 1.1 cgd regtry(prog, string)
850 1.1 cgd regexp *prog;
851 1.1 cgd char *string;
852 1.1 cgd {
853 1.1 cgd register int i;
854 1.1 cgd register char **sp;
855 1.1 cgd register char **ep;
856 1.1 cgd
857 1.1 cgd reginput = string;
858 1.1 cgd regstartp = prog->startp;
859 1.1 cgd regendp = prog->endp;
860 1.1 cgd
861 1.1 cgd sp = prog->startp;
862 1.1 cgd ep = prog->endp;
863 1.1 cgd for (i = NSUBEXP; i > 0; i--) {
864 1.1 cgd *sp++ = NULL;
865 1.1 cgd *ep++ = NULL;
866 1.1 cgd }
867 1.1 cgd if (regmatch(prog->program + 1)) {
868 1.1 cgd prog->startp[0] = string;
869 1.1 cgd prog->endp[0] = reginput;
870 1.1 cgd return(1);
871 1.1 cgd } else
872 1.1 cgd return(0);
873 1.1 cgd }
874 1.1 cgd
875 1.1 cgd /*
876 1.1 cgd - regmatch - main matching routine
877 1.1 cgd *
878 1.1 cgd * Conceptually the strategy is simple: check to see whether the current
879 1.1 cgd * node matches, call self recursively to see whether the rest matches,
880 1.1 cgd * and then act accordingly. In practice we make some effort to avoid
881 1.1 cgd * recursion, in particular by going through "ordinary" nodes (that don't
882 1.1 cgd * need to know whether the rest of the match failed) by a loop instead of
883 1.1 cgd * by recursion.
884 1.1 cgd */
885 1.1 cgd static int /* 0 failure, 1 success */
886 1.1 cgd regmatch(prog)
887 1.1 cgd char *prog;
888 1.1 cgd {
889 1.1 cgd register char *scan; /* Current node. */
890 1.1 cgd char *next; /* Next node. */
891 1.1 cgd extern char *strchr();
892 1.1 cgd
893 1.1 cgd scan = prog;
894 1.1 cgd #ifdef DEBUG
895 1.1 cgd if (scan != NULL && regnarrate)
896 1.1 cgd fprintf(stderr, "%s(\n", regprop(scan));
897 1.1 cgd #endif
898 1.1 cgd while (scan != NULL) {
899 1.1 cgd #ifdef DEBUG
900 1.1 cgd if (regnarrate)
901 1.1 cgd fprintf(stderr, "%s...\n", regprop(scan));
902 1.1 cgd #endif
903 1.1 cgd next = regnext(scan);
904 1.1 cgd
905 1.1 cgd switch (OP(scan)) {
906 1.1 cgd case BOL:
907 1.1 cgd if (reginput != regbol)
908 1.1 cgd return(0);
909 1.1 cgd break;
910 1.1 cgd case EOL:
911 1.1 cgd if (*reginput != '\0')
912 1.1 cgd return(0);
913 1.1 cgd break;
914 1.1 cgd case WORDA:
915 1.1 cgd /* Must be looking at a letter, digit, or _ */
916 1.1 cgd if ((!isalnum(*reginput)) && *reginput != '_')
917 1.1 cgd return(0);
918 1.1 cgd /* Prev must be BOL or nonword */
919 1.1 cgd if (reginput > regbol &&
920 1.1 cgd (isalnum(reginput[-1]) || reginput[-1] == '_'))
921 1.1 cgd return(0);
922 1.1 cgd break;
923 1.1 cgd case WORDZ:
924 1.1 cgd /* Must be looking at non letter, digit, or _ */
925 1.1 cgd if (isalnum(*reginput) || *reginput == '_')
926 1.1 cgd return(0);
927 1.1 cgd /* We don't care what the previous char was */
928 1.1 cgd break;
929 1.1 cgd case ANY:
930 1.1 cgd if (*reginput == '\0')
931 1.1 cgd return(0);
932 1.1 cgd reginput++;
933 1.1 cgd break;
934 1.1 cgd case EXACTLY: {
935 1.1 cgd register int len;
936 1.1 cgd register char *opnd;
937 1.1 cgd
938 1.1 cgd opnd = OPERAND(scan);
939 1.1 cgd /* Inline the first character, for speed. */
940 1.1 cgd if (*opnd != *reginput)
941 1.1 cgd return(0);
942 1.1 cgd len = strlen(opnd);
943 1.1 cgd if (len > 1 && strncmp(opnd, reginput, len) != 0)
944 1.1 cgd return(0);
945 1.1 cgd reginput += len;
946 1.1 cgd }
947 1.1 cgd break;
948 1.1 cgd case ANYOF:
949 1.1 cgd if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
950 1.1 cgd return(0);
951 1.1 cgd reginput++;
952 1.1 cgd break;
953 1.1 cgd case ANYBUT:
954 1.1 cgd if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
955 1.1 cgd return(0);
956 1.1 cgd reginput++;
957 1.1 cgd break;
958 1.1 cgd case NOTHING:
959 1.1 cgd break;
960 1.1 cgd case BACK:
961 1.1 cgd break;
962 1.1 cgd case OPEN+1:
963 1.1 cgd case OPEN+2:
964 1.1 cgd case OPEN+3:
965 1.1 cgd case OPEN+4:
966 1.1 cgd case OPEN+5:
967 1.1 cgd case OPEN+6:
968 1.1 cgd case OPEN+7:
969 1.1 cgd case OPEN+8:
970 1.1 cgd case OPEN+9: {
971 1.1 cgd register int no;
972 1.1 cgd register char *save;
973 1.1 cgd
974 1.1 cgd no = OP(scan) - OPEN;
975 1.1 cgd save = reginput;
976 1.1 cgd
977 1.1 cgd if (regmatch(next)) {
978 1.1 cgd /*
979 1.1 cgd * Don't set startp if some later
980 1.1 cgd * invocation of the same parentheses
981 1.1 cgd * already has.
982 1.1 cgd */
983 1.1 cgd if (regstartp[no] == NULL)
984 1.1 cgd regstartp[no] = save;
985 1.1 cgd return(1);
986 1.1 cgd } else
987 1.1 cgd return(0);
988 1.1 cgd }
989 1.1 cgd break;
990 1.1 cgd case CLOSE+1:
991 1.1 cgd case CLOSE+2:
992 1.1 cgd case CLOSE+3:
993 1.1 cgd case CLOSE+4:
994 1.1 cgd case CLOSE+5:
995 1.1 cgd case CLOSE+6:
996 1.1 cgd case CLOSE+7:
997 1.1 cgd case CLOSE+8:
998 1.1 cgd case CLOSE+9: {
999 1.1 cgd register int no;
1000 1.1 cgd register char *save;
1001 1.1 cgd
1002 1.1 cgd no = OP(scan) - CLOSE;
1003 1.1 cgd save = reginput;
1004 1.1 cgd
1005 1.1 cgd if (regmatch(next)) {
1006 1.1 cgd /*
1007 1.1 cgd * Don't set endp if some later
1008 1.1 cgd * invocation of the same parentheses
1009 1.1 cgd * already has.
1010 1.1 cgd */
1011 1.1 cgd if (regendp[no] == NULL)
1012 1.1 cgd regendp[no] = save;
1013 1.1 cgd return(1);
1014 1.1 cgd } else
1015 1.1 cgd return(0);
1016 1.1 cgd }
1017 1.1 cgd break;
1018 1.1 cgd case BRANCH: {
1019 1.1 cgd register char *save;
1020 1.1 cgd
1021 1.1 cgd if (OP(next) != BRANCH) /* No choice. */
1022 1.1 cgd next = OPERAND(scan); /* Avoid recursion. */
1023 1.1 cgd else {
1024 1.1 cgd do {
1025 1.1 cgd save = reginput;
1026 1.1 cgd if (regmatch(OPERAND(scan)))
1027 1.1 cgd return(1);
1028 1.1 cgd reginput = save;
1029 1.1 cgd scan = regnext(scan);
1030 1.1 cgd } while (scan != NULL && OP(scan) == BRANCH);
1031 1.1 cgd return(0);
1032 1.1 cgd /* NOTREACHED */
1033 1.1 cgd }
1034 1.1 cgd }
1035 1.1 cgd break;
1036 1.1 cgd case STAR:
1037 1.1 cgd case PLUS: {
1038 1.1 cgd register char nextch;
1039 1.1 cgd register int no;
1040 1.1 cgd register char *save;
1041 1.1 cgd register int min;
1042 1.1 cgd
1043 1.1 cgd /*
1044 1.1 cgd * Lookahead to avoid useless match attempts
1045 1.1 cgd * when we know what character comes next.
1046 1.1 cgd */
1047 1.1 cgd nextch = '\0';
1048 1.1 cgd if (OP(next) == EXACTLY)
1049 1.1 cgd nextch = *OPERAND(next);
1050 1.1 cgd min = (OP(scan) == STAR) ? 0 : 1;
1051 1.1 cgd save = reginput;
1052 1.1 cgd no = regrepeat(OPERAND(scan));
1053 1.1 cgd while (no >= min) {
1054 1.1 cgd /* If it could work, try it. */
1055 1.1 cgd if (nextch == '\0' || *reginput == nextch)
1056 1.1 cgd if (regmatch(next))
1057 1.1 cgd return(1);
1058 1.1 cgd /* Couldn't or didn't -- back up. */
1059 1.1 cgd no--;
1060 1.1 cgd reginput = save + no;
1061 1.1 cgd }
1062 1.1 cgd return(0);
1063 1.1 cgd }
1064 1.1 cgd break;
1065 1.1 cgd case END:
1066 1.1 cgd return(1); /* Success! */
1067 1.1 cgd break;
1068 1.1 cgd default:
1069 1.1 cgd regerror("memory corruption");
1070 1.1 cgd return(0);
1071 1.1 cgd break;
1072 1.1 cgd }
1073 1.1 cgd
1074 1.1 cgd scan = next;
1075 1.1 cgd }
1076 1.1 cgd
1077 1.1 cgd /*
1078 1.1 cgd * We get here only if there's trouble -- normally "case END" is
1079 1.1 cgd * the terminating point.
1080 1.1 cgd */
1081 1.1 cgd regerror("corrupted pointers");
1082 1.1 cgd return(0);
1083 1.1 cgd }
1084 1.1 cgd
1085 1.1 cgd /*
1086 1.1 cgd - regrepeat - repeatedly match something simple, report how many
1087 1.1 cgd */
1088 1.1 cgd static int
1089 1.1 cgd regrepeat(p)
1090 1.1 cgd char *p;
1091 1.1 cgd {
1092 1.1 cgd register int count = 0;
1093 1.1 cgd register char *scan;
1094 1.1 cgd register char *opnd;
1095 1.1 cgd
1096 1.1 cgd scan = reginput;
1097 1.1 cgd opnd = OPERAND(p);
1098 1.1 cgd switch (OP(p)) {
1099 1.1 cgd case ANY:
1100 1.1 cgd count = strlen(scan);
1101 1.1 cgd scan += count;
1102 1.1 cgd break;
1103 1.1 cgd case EXACTLY:
1104 1.1 cgd while (*opnd == *scan) {
1105 1.1 cgd count++;
1106 1.1 cgd scan++;
1107 1.1 cgd }
1108 1.1 cgd break;
1109 1.1 cgd case ANYOF:
1110 1.1 cgd while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1111 1.1 cgd count++;
1112 1.1 cgd scan++;
1113 1.1 cgd }
1114 1.1 cgd break;
1115 1.1 cgd case ANYBUT:
1116 1.1 cgd while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1117 1.1 cgd count++;
1118 1.1 cgd scan++;
1119 1.1 cgd }
1120 1.1 cgd break;
1121 1.1 cgd default: /* Oh dear. Called inappropriately. */
1122 1.1 cgd regerror("internal foulup");
1123 1.1 cgd count = 0; /* Best compromise. */
1124 1.1 cgd break;
1125 1.1 cgd }
1126 1.1 cgd reginput = scan;
1127 1.1 cgd
1128 1.1 cgd return(count);
1129 1.1 cgd }
1130 1.1 cgd
1131 1.1 cgd /*
1132 1.1 cgd - regnext - dig the "next" pointer out of a node
1133 1.1 cgd */
1134 1.1 cgd static char *
1135 1.1 cgd regnext(p)
1136 1.1 cgd register char *p;
1137 1.1 cgd {
1138 1.1 cgd register int offset;
1139 1.1 cgd
1140 1.1 cgd if (p == ®dummy)
1141 1.1 cgd return(NULL);
1142 1.1 cgd
1143 1.1 cgd offset = NEXT(p);
1144 1.1 cgd if (offset == 0)
1145 1.1 cgd return(NULL);
1146 1.1 cgd
1147 1.1 cgd if (OP(p) == BACK)
1148 1.1 cgd return(p-offset);
1149 1.1 cgd else
1150 1.1 cgd return(p+offset);
1151 1.1 cgd }
1152 1.1 cgd
1153 1.1 cgd #ifdef DEBUG
1154 1.1 cgd
1155 1.1 cgd STATIC char *regprop();
1156 1.1 cgd
1157 1.1 cgd /*
1158 1.1 cgd - regdump - dump a regexp onto stdout in vaguely comprehensible form
1159 1.1 cgd */
1160 1.1 cgd void
1161 1.1 cgd regdump(r)
1162 1.1 cgd regexp *r;
1163 1.1 cgd {
1164 1.1 cgd register char *s;
1165 1.1 cgd register char op = EXACTLY; /* Arbitrary non-END op. */
1166 1.1 cgd register char *next;
1167 1.1 cgd extern char *strchr();
1168 1.1 cgd
1169 1.1 cgd
1170 1.1 cgd s = r->program + 1;
1171 1.1 cgd while (op != END) { /* While that wasn't END last time... */
1172 1.1 cgd op = OP(s);
1173 1.1 cgd printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1174 1.1 cgd next = regnext(s);
1175 1.1 cgd if (next == NULL) /* Next ptr. */
1176 1.1 cgd printf("(0)");
1177 1.1 cgd else
1178 1.1 cgd printf("(%d)", (s-r->program)+(next-s));
1179 1.1 cgd s += 3;
1180 1.1 cgd if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1181 1.1 cgd /* Literal string, where present. */
1182 1.1 cgd while (*s != '\0') {
1183 1.1 cgd putchar(*s);
1184 1.1 cgd s++;
1185 1.1 cgd }
1186 1.1 cgd s++;
1187 1.1 cgd }
1188 1.1 cgd putchar('\n');
1189 1.1 cgd }
1190 1.1 cgd
1191 1.1 cgd /* Header fields of interest. */
1192 1.1 cgd if (r->regstart != '\0')
1193 1.1 cgd printf("start `%c' ", r->regstart);
1194 1.1 cgd if (r->reganch)
1195 1.1 cgd printf("anchored ");
1196 1.1 cgd if (r->regmust != NULL)
1197 1.1 cgd printf("must have \"%s\"", r->regmust);
1198 1.1 cgd printf("\n");
1199 1.1 cgd }
1200 1.1 cgd
1201 1.1 cgd /*
1202 1.1 cgd - regprop - printable representation of opcode
1203 1.1 cgd */
1204 1.1 cgd static char *
1205 1.1 cgd regprop(op)
1206 1.1 cgd char *op;
1207 1.1 cgd {
1208 1.1 cgd register char *p;
1209 1.1 cgd static char buf[50];
1210 1.1 cgd
1211 1.1 cgd (void) strcpy(buf, ":");
1212 1.1 cgd
1213 1.1 cgd switch (OP(op)) {
1214 1.1 cgd case BOL:
1215 1.1 cgd p = "BOL";
1216 1.1 cgd break;
1217 1.1 cgd case EOL:
1218 1.1 cgd p = "EOL";
1219 1.1 cgd break;
1220 1.1 cgd case ANY:
1221 1.1 cgd p = "ANY";
1222 1.1 cgd break;
1223 1.1 cgd case ANYOF:
1224 1.1 cgd p = "ANYOF";
1225 1.1 cgd break;
1226 1.1 cgd case ANYBUT:
1227 1.1 cgd p = "ANYBUT";
1228 1.1 cgd break;
1229 1.1 cgd case BRANCH:
1230 1.1 cgd p = "BRANCH";
1231 1.1 cgd break;
1232 1.1 cgd case EXACTLY:
1233 1.1 cgd p = "EXACTLY";
1234 1.1 cgd break;
1235 1.1 cgd case NOTHING:
1236 1.1 cgd p = "NOTHING";
1237 1.1 cgd break;
1238 1.1 cgd case BACK:
1239 1.1 cgd p = "BACK";
1240 1.1 cgd break;
1241 1.1 cgd case END:
1242 1.1 cgd p = "END";
1243 1.1 cgd break;
1244 1.1 cgd case OPEN+1:
1245 1.1 cgd case OPEN+2:
1246 1.1 cgd case OPEN+3:
1247 1.1 cgd case OPEN+4:
1248 1.1 cgd case OPEN+5:
1249 1.1 cgd case OPEN+6:
1250 1.1 cgd case OPEN+7:
1251 1.1 cgd case OPEN+8:
1252 1.1 cgd case OPEN+9:
1253 1.1 cgd sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1254 1.1 cgd p = NULL;
1255 1.1 cgd break;
1256 1.1 cgd case CLOSE+1:
1257 1.1 cgd case CLOSE+2:
1258 1.1 cgd case CLOSE+3:
1259 1.1 cgd case CLOSE+4:
1260 1.1 cgd case CLOSE+5:
1261 1.1 cgd case CLOSE+6:
1262 1.1 cgd case CLOSE+7:
1263 1.1 cgd case CLOSE+8:
1264 1.1 cgd case CLOSE+9:
1265 1.1 cgd sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1266 1.1 cgd p = NULL;
1267 1.1 cgd break;
1268 1.1 cgd case STAR:
1269 1.1 cgd p = "STAR";
1270 1.1 cgd break;
1271 1.1 cgd case PLUS:
1272 1.1 cgd p = "PLUS";
1273 1.1 cgd break;
1274 1.1 cgd case WORDA:
1275 1.1 cgd p = "WORDA";
1276 1.1 cgd break;
1277 1.1 cgd case WORDZ:
1278 1.1 cgd p = "WORDZ";
1279 1.1 cgd break;
1280 1.1 cgd default:
1281 1.1 cgd regerror("corrupted opcode");
1282 1.1 cgd break;
1283 1.1 cgd }
1284 1.1 cgd if (p != NULL)
1285 1.1 cgd (void) strcat(buf, p);
1286 1.1 cgd return(buf);
1287 1.1 cgd }
1288 1.1 cgd #endif
1289 1.1 cgd
1290 1.1 cgd /*
1291 1.1 cgd * The following is provided for those people who do not have strcspn() in
1292 1.1 cgd * their C libraries. They should get off their butts and do something
1293 1.1 cgd * about it; at least one public-domain implementation of those (highly
1294 1.1 cgd * useful) string routines has been published on Usenet.
1295 1.1 cgd */
1296 1.1 cgd #ifdef STRCSPN
1297 1.1 cgd /*
1298 1.1 cgd * strcspn - find length of initial segment of s1 consisting entirely
1299 1.1 cgd * of characters not from s2
1300 1.1 cgd */
1301 1.1 cgd
1302 1.1 cgd static int
1303 1.1 cgd strcspn(s1, s2)
1304 1.1 cgd char *s1;
1305 1.1 cgd char *s2;
1306 1.1 cgd {
1307 1.1 cgd register char *scan1;
1308 1.1 cgd register char *scan2;
1309 1.1 cgd register int count;
1310 1.1 cgd
1311 1.1 cgd count = 0;
1312 1.1 cgd for (scan1 = s1; *scan1 != '\0'; scan1++) {
1313 1.1 cgd for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1314 1.1 cgd if (*scan1 == *scan2++)
1315 1.1 cgd return(count);
1316 1.1 cgd count++;
1317 1.1 cgd }
1318 1.1 cgd return(count);
1319 1.1 cgd }
1320 1.1 cgd #endif
1321