engine.c revision 1.9 1 /* $NetBSD: engine.c,v 1.9 1998/11/14 16:43:49 christos Exp $ */
2
3 /*-
4 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5 * Copyright (c) 1992, 1993, 1994
6 * The Regents of the University of California. All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * Henry Spencer.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the University of
22 * California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 *
39 * @(#)engine.c 8.5 (Berkeley) 3/20/94
40 */
41
42 /*
43 * The matching engine and friends. This file is #included by regexec.c
44 * after suitable #defines of a variety of macros used herein, so that
45 * different state representations can be used without duplicating masses
46 * of code.
47 */
48
49 #ifdef SNAMES
50 #define matcher smatcher
51 #define fast sfast
52 #define slow sslow
53 #define dissect sdissect
54 #define backref sbackref
55 #define step sstep
56 #define print sprint
57 #define at sat
58 #define match smat
59 #endif
60 #ifdef LNAMES
61 #define matcher lmatcher
62 #define fast lfast
63 #define slow lslow
64 #define dissect ldissect
65 #define backref lbackref
66 #define step lstep
67 #define print lprint
68 #define at lat
69 #define match lmat
70 #endif
71
72 /* another structure passed up and down to avoid zillions of parameters */
73 struct match {
74 struct re_guts *g;
75 int eflags;
76 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
77 char *offp; /* offsets work from here */
78 char *beginp; /* start of string -- virtual NUL precedes */
79 char *endp; /* end of string -- virtual NUL here */
80 char *coldp; /* can be no match starting before here */
81 char **lastpos; /* [nplus+1] */
82 STATEVARS;
83 states st; /* current states */
84 states fresh; /* states for a fresh start */
85 states tmp; /* temporary */
86 states empty; /* empty set of states */
87 };
88
89 /* ========= begin header generated by ./mkh ========= */
90 #ifdef __cplusplus
91 extern "C" {
92 #endif
93
94 /* === engine.c === */
95 static int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags));
96 static char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
97 static char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev));
98 static char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
99 static char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
100 static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft));
101 #define BOL (OUT+1)
102 #define EOL (BOL+1)
103 #define BOLEOL (BOL+2)
104 #define NOTHING (BOL+3)
105 #define BOW (BOL+4)
106 #define EOW (BOL+5)
107 #define CODEMAX (BOL+5) /* highest code used */
108 #define NONCHAR(c) ((c) > CHAR_MAX)
109 #define NNONCHAR (CODEMAX-CHAR_MAX)
110 #ifdef REDEBUG
111 static void print __P((struct match *m, char *caption, states st, int ch, FILE *d));
112 #endif
113 #ifdef REDEBUG
114 static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst));
115 #endif
116 #ifdef REDEBUG
117 static char *pchar __P((int ch));
118 #endif
119
120 #ifdef __cplusplus
121 }
122 #endif
123 /* ========= end header generated by ./mkh ========= */
124
125 #ifdef REDEBUG
126 #define SP(t, s, c) print(m, t, s, c, stdout)
127 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
128 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); }
129 #else
130 #define SP(t, s, c) /* nothing */
131 #define AT(t, p1, p2, s1, s2) /* nothing */
132 #define NOTE(s) /* nothing */
133 #endif
134
135 /*
136 - matcher - the actual matching engine
137 == static int matcher(struct re_guts *g, char *string, \
138 == size_t nmatch, regmatch_t pmatch[], int eflags);
139 */
140 static int /* 0 success, REG_NOMATCH failure */
141 matcher(g, string, nmatch, pmatch, eflags)
142 struct re_guts *g;
143 char *string;
144 size_t nmatch;
145 regmatch_t pmatch[];
146 int eflags;
147 {
148 char *endp;
149 int i;
150 struct match mv;
151 struct match *m = &mv;
152 char *dp;
153 const sopno gf = g->firststate+1; /* +1 for OEND */
154 const sopno gl = g->laststate;
155 char *start;
156 char *stop;
157
158 /* simplify the situation where possible */
159 if (g->cflags®_NOSUB)
160 nmatch = 0;
161 if (eflags®_STARTEND) {
162 start = string + (size_t)pmatch[0].rm_so;
163 stop = string + (size_t)pmatch[0].rm_eo;
164 } else {
165 start = string;
166 stop = start + strlen(start);
167 }
168 if (stop < start)
169 return(REG_INVARG);
170
171 /* prescreening; this does wonders for this rather slow code */
172 if (g->must != NULL) {
173 for (dp = start; dp < stop; dp++)
174 if (*dp == g->must[0] && stop - dp >= g->mlen &&
175 memcmp(dp, g->must, (size_t)g->mlen) == 0)
176 break;
177 if (dp == stop) /* we didn't find g->must */
178 return(REG_NOMATCH);
179 }
180
181 /* match struct setup */
182 m->g = g;
183 m->eflags = eflags;
184 m->pmatch = NULL;
185 m->lastpos = NULL;
186 m->offp = string;
187 m->beginp = start;
188 m->endp = stop;
189 STATESETUP(m, 4);
190 SETUP(m->st);
191 SETUP(m->fresh);
192 SETUP(m->tmp);
193 SETUP(m->empty);
194 CLEAR(m->empty);
195
196 /* this loop does only one repetition except for backrefs */
197 for (;;) {
198 endp = fast(m, start, stop, gf, gl);
199 if (endp == NULL) { /* a miss */
200 STATETEARDOWN(m);
201 return(REG_NOMATCH);
202 }
203 if (nmatch == 0 && !g->backrefs)
204 break; /* no further info needed */
205
206 /* where? */
207 assert(m->coldp != NULL);
208 for (;;) {
209 NOTE("finding start");
210 endp = slow(m, m->coldp, stop, gf, gl);
211 if (endp != NULL)
212 break;
213 assert(m->coldp < m->endp);
214 m->coldp++;
215 }
216 if (nmatch == 1 && !g->backrefs)
217 break; /* no further info needed */
218
219 /* oh my, he wants the subexpressions... */
220 if (m->pmatch == NULL)
221 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
222 sizeof(regmatch_t));
223 if (m->pmatch == NULL) {
224 STATETEARDOWN(m);
225 return(REG_ESPACE);
226 }
227 for (i = 1; i <= m->g->nsub; i++)
228 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
229 if (!g->backrefs && !(m->eflags®_BACKR)) {
230 NOTE("dissecting");
231 dp = dissect(m, m->coldp, endp, gf, gl);
232 } else {
233 if (g->nplus > 0 && m->lastpos == NULL)
234 m->lastpos = (char **)malloc((g->nplus+1) *
235 sizeof(char *));
236 if (g->nplus > 0 && m->lastpos == NULL) {
237 free(m->pmatch);
238 STATETEARDOWN(m);
239 return(REG_ESPACE);
240 }
241 NOTE("backref dissect");
242 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
243 }
244 if (dp != NULL)
245 break;
246
247 /* uh-oh... we couldn't find a subexpression-level match */
248 assert(g->backrefs); /* must be back references doing it */
249 assert(g->nplus == 0 || m->lastpos != NULL);
250 for (;;) {
251 if (dp != NULL || endp <= m->coldp)
252 break; /* defeat */
253 NOTE("backoff");
254 endp = slow(m, m->coldp, endp-1, gf, gl);
255 if (endp == NULL)
256 break; /* defeat */
257 /* try it on a shorter possibility */
258 #ifndef NDEBUG
259 for (i = 1; i <= m->g->nsub; i++) {
260 assert(m->pmatch[i].rm_so == -1);
261 assert(m->pmatch[i].rm_eo == -1);
262 }
263 #endif
264 NOTE("backoff dissect");
265 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
266 }
267 assert(dp == NULL || dp == endp);
268 if (dp != NULL) /* found a shorter one */
269 break;
270
271 /* despite initial appearances, there is no match here */
272 NOTE("false alarm");
273 start = m->coldp + 1; /* recycle starting later */
274 assert(start <= stop);
275 }
276
277 /* fill in the details if requested */
278 if (nmatch > 0) {
279 pmatch[0].rm_so = m->coldp - m->offp;
280 pmatch[0].rm_eo = endp - m->offp;
281 }
282 if (nmatch > 1) {
283 assert(m->pmatch != NULL);
284 for (i = 1; i < nmatch; i++)
285 if (i <= m->g->nsub)
286 pmatch[i] = m->pmatch[i];
287 else {
288 pmatch[i].rm_so = -1;
289 pmatch[i].rm_eo = -1;
290 }
291 }
292
293 if (m->pmatch != NULL)
294 free(m->pmatch);
295 if (m->lastpos != NULL)
296 free(m->lastpos);
297 STATETEARDOWN(m);
298 return(0);
299 }
300
301 /*
302 - dissect - figure out what matched what, no back references
303 == static char *dissect(struct match *m, char *start, \
304 == char *stop, sopno startst, sopno stopst);
305 */
306 static char * /* == stop (success) always */
307 dissect(m, start, stop, startst, stopst)
308 struct match *m;
309 char *start;
310 char *stop;
311 sopno startst;
312 sopno stopst;
313 {
314 int i;
315 sopno ss; /* start sop of current subRE */
316 sopno es; /* end sop of current subRE */
317 char *sp; /* start of string matched by it */
318 char *stp; /* string matched by it cannot pass here */
319 char *rest; /* start of rest of string */
320 char *tail; /* string unmatched by rest of RE */
321 sopno ssub; /* start sop of subsubRE */
322 sopno esub; /* end sop of subsubRE */
323 char *ssp; /* start of string matched by subsubRE */
324 char *sep; /* end of string matched by subsubRE */
325 char *oldssp; /* previous ssp */
326 #ifndef NDEBUG
327 char *dp;
328 #endif
329
330 AT("diss", start, stop, startst, stopst);
331 sp = start;
332 for (ss = startst; ss < stopst; ss = es) {
333 /* identify end of subRE */
334 es = ss;
335 switch (OP(m->g->strip[es])) {
336 case OPLUS_:
337 case OQUEST_:
338 es += OPND(m->g->strip[es]);
339 break;
340 case OCH_:
341 while (OP(m->g->strip[es]) != O_CH)
342 es += OPND(m->g->strip[es]);
343 break;
344 }
345 es++;
346
347 /* figure out what it matched */
348 switch (OP(m->g->strip[ss])) {
349 case OEND:
350 assert(nope);
351 break;
352 case OCHAR:
353 sp++;
354 break;
355 case OBOL:
356 case OEOL:
357 case OBOW:
358 case OEOW:
359 break;
360 case OANY:
361 case OANYOF:
362 sp++;
363 break;
364 case OBACK_:
365 case O_BACK:
366 assert(nope);
367 break;
368 /* cases where length of match is hard to find */
369 case OQUEST_:
370 stp = stop;
371 for (;;) {
372 /* how long could this one be? */
373 rest = slow(m, sp, stp, ss, es);
374 assert(rest != NULL); /* it did match */
375 /* could the rest match the rest? */
376 tail = slow(m, rest, stop, es, stopst);
377 if (tail == stop)
378 break; /* yes! */
379 /* no -- try a shorter match for this one */
380 stp = rest - 1;
381 assert(stp >= sp); /* it did work */
382 }
383 ssub = ss + 1;
384 esub = es - 1;
385 /* did innards match? */
386 if (slow(m, sp, rest, ssub, esub) != NULL) {
387 #ifdef NDEBUG
388 (void)
389 #else
390 dp =
391 #endif
392 dissect(m, sp, rest, ssub, esub);
393 assert(dp == rest);
394 } else /* no */
395 assert(sp == rest);
396 sp = rest;
397 break;
398 case OPLUS_:
399 stp = stop;
400 for (;;) {
401 /* how long could this one be? */
402 rest = slow(m, sp, stp, ss, es);
403 assert(rest != NULL); /* it did match */
404 /* could the rest match the rest? */
405 tail = slow(m, rest, stop, es, stopst);
406 if (tail == stop)
407 break; /* yes! */
408 /* no -- try a shorter match for this one */
409 stp = rest - 1;
410 assert(stp >= sp); /* it did work */
411 }
412 ssub = ss + 1;
413 esub = es - 1;
414 ssp = sp;
415 oldssp = ssp;
416 for (;;) { /* find last match of innards */
417 sep = slow(m, ssp, rest, ssub, esub);
418 if (sep == NULL || sep == ssp)
419 break; /* failed or matched null */
420 oldssp = ssp; /* on to next try */
421 ssp = sep;
422 }
423 if (sep == NULL) {
424 /* last successful match */
425 sep = ssp;
426 ssp = oldssp;
427 }
428 assert(sep == rest); /* must exhaust substring */
429 assert(slow(m, ssp, sep, ssub, esub) == rest);
430 #ifdef NDEBUG
431 (void)
432 #else
433 dp =
434 #endif
435 dissect(m, ssp, sep, ssub, esub);
436 assert(dp == sep);
437 sp = rest;
438 break;
439 case OCH_:
440 stp = stop;
441 for (;;) {
442 /* how long could this one be? */
443 rest = slow(m, sp, stp, ss, es);
444 assert(rest != NULL); /* it did match */
445 /* could the rest match the rest? */
446 tail = slow(m, rest, stop, es, stopst);
447 if (tail == stop)
448 break; /* yes! */
449 /* no -- try a shorter match for this one */
450 stp = rest - 1;
451 assert(stp >= sp); /* it did work */
452 }
453 ssub = ss + 1;
454 esub = ss + OPND(m->g->strip[ss]) - 1;
455 assert(OP(m->g->strip[esub]) == OOR1);
456 for (;;) { /* find first matching branch */
457 if (slow(m, sp, rest, ssub, esub) == rest)
458 break; /* it matched all of it */
459 /* that one missed, try next one */
460 assert(OP(m->g->strip[esub]) == OOR1);
461 esub++;
462 assert(OP(m->g->strip[esub]) == OOR2);
463 ssub = esub + 1;
464 esub += OPND(m->g->strip[esub]);
465 if (OP(m->g->strip[esub]) == OOR2)
466 esub--;
467 else
468 assert(OP(m->g->strip[esub]) == O_CH);
469 }
470 #ifdef NDEBUG
471 (void)
472 #else
473 dp =
474 #endif
475 dissect(m, sp, rest, ssub, esub);
476 assert(dp == rest);
477 sp = rest;
478 break;
479 case O_PLUS:
480 case O_QUEST:
481 case OOR1:
482 case OOR2:
483 case O_CH:
484 assert(nope);
485 break;
486 case OLPAREN:
487 i = OPND(m->g->strip[ss]);
488 assert(0 < i && i <= m->g->nsub);
489 m->pmatch[i].rm_so = sp - m->offp;
490 break;
491 case ORPAREN:
492 i = OPND(m->g->strip[ss]);
493 assert(0 < i && i <= m->g->nsub);
494 m->pmatch[i].rm_eo = sp - m->offp;
495 break;
496 default: /* uh oh */
497 assert(nope);
498 break;
499 }
500 }
501
502 assert(sp == stop);
503 return(sp);
504 }
505
506 /*
507 - backref - figure out what matched what, figuring in back references
508 == static char *backref(struct match *m, char *start, \
509 == char *stop, sopno startst, sopno stopst, sopno lev);
510 */
511 static char * /* == stop (success) or NULL (failure) */
512 backref(m, start, stop, startst, stopst, lev)
513 struct match *m;
514 char *start;
515 char *stop;
516 sopno startst;
517 sopno stopst;
518 sopno lev; /* PLUS nesting level */
519 {
520 int i;
521 sopno ss; /* start sop of current subRE */
522 char *sp; /* start of string matched by it */
523 sopno ssub; /* start sop of subsubRE */
524 sopno esub; /* end sop of subsubRE */
525 char *ssp; /* start of string matched by subsubRE */
526 char *dp;
527 size_t len;
528 int hard;
529 sop s;
530 regoff_t offsave;
531 cset *cs;
532
533 AT("back", start, stop, startst, stopst);
534 sp = start;
535
536 /* get as far as we can with easy stuff */
537 hard = 0;
538 for (ss = startst; !hard && ss < stopst; ss++)
539 switch (OP(s = m->g->strip[ss])) {
540 case OCHAR:
541 if (sp == stop || *sp++ != (char)OPND(s))
542 return(NULL);
543 break;
544 case OANY:
545 if (sp == stop)
546 return(NULL);
547 sp++;
548 break;
549 case OANYOF:
550 cs = &m->g->sets[OPND(s)];
551 if (sp == stop || !CHIN(cs, *sp++))
552 return(NULL);
553 break;
554 case OBOL:
555 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
556 (sp < m->endp && *(sp-1) == '\n' &&
557 (m->g->cflags®_NEWLINE)) )
558 { /* yes */ }
559 else
560 return(NULL);
561 break;
562 case OEOL:
563 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
564 (sp < m->endp && *sp == '\n' &&
565 (m->g->cflags®_NEWLINE)) )
566 { /* yes */ }
567 else
568 return(NULL);
569 break;
570 case OBOW:
571 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
572 (sp < m->endp && *(sp-1) == '\n' &&
573 (m->g->cflags®_NEWLINE)) ||
574 (sp > m->beginp &&
575 !ISWORD(*(sp-1))) ) &&
576 (sp < m->endp && ISWORD(*sp)) )
577 { /* yes */ }
578 else
579 return(NULL);
580 break;
581 case OEOW:
582 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
583 (sp < m->endp && *sp == '\n' &&
584 (m->g->cflags®_NEWLINE)) ||
585 (sp < m->endp && !ISWORD(*sp)) ) &&
586 (sp > m->beginp && ISWORD(*(sp-1))) )
587 { /* yes */ }
588 else
589 return(NULL);
590 break;
591 case O_QUEST:
592 break;
593 case OOR1: /* matches null but needs to skip */
594 ss++;
595 s = m->g->strip[ss];
596 do {
597 assert(OP(s) == OOR2);
598 ss += OPND(s);
599 } while (OP(s = m->g->strip[ss]) != O_CH);
600 /* note that the ss++ gets us past the O_CH */
601 break;
602 default: /* have to make a choice */
603 hard = 1;
604 break;
605 }
606 if (!hard) { /* that was it! */
607 if (sp != stop)
608 return(NULL);
609 return(sp);
610 }
611 ss--; /* adjust for the for's final increment */
612
613 /* the hard stuff */
614 AT("hard", sp, stop, ss, stopst);
615 s = m->g->strip[ss];
616 switch (OP(s)) {
617 case OBACK_: /* the vilest depths */
618 i = OPND(s);
619 assert(0 < i && i <= m->g->nsub);
620 if (m->pmatch[i].rm_eo == -1)
621 return(NULL);
622 assert(m->pmatch[i].rm_so != -1);
623 len = (size_t)(m->pmatch[i].rm_eo - m->pmatch[i].rm_so);
624 assert(stop - m->beginp >= len);
625 if (sp > stop - len)
626 return(NULL); /* not enough left to match */
627 ssp = m->offp + (size_t)m->pmatch[i].rm_so;
628 if (memcmp(sp, ssp, len) != 0)
629 return(NULL);
630 while (m->g->strip[ss] != SOP(O_BACK, i))
631 ss++;
632 return(backref(m, sp+len, stop, ss+1, stopst, lev));
633
634 case OQUEST_: /* to null or not */
635 dp = backref(m, sp, stop, ss+1, stopst, lev);
636 if (dp != NULL)
637 return(dp); /* not */
638 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
639
640 case OPLUS_:
641 assert(m->lastpos != NULL);
642 assert(lev+1 <= m->g->nplus);
643 m->lastpos[lev+1] = sp;
644 return(backref(m, sp, stop, ss+1, stopst, lev+1));
645
646 case O_PLUS:
647 if (sp == m->lastpos[lev]) /* last pass matched null */
648 return(backref(m, sp, stop, ss+1, stopst, lev-1));
649 /* try another pass */
650 m->lastpos[lev] = sp;
651 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
652 if (dp == NULL)
653 dp = backref(m, sp, stop, ss+1, stopst, lev-1);
654 return(dp);
655
656 case OCH_: /* find the right one, if any */
657 ssub = ss + 1;
658 esub = ss + OPND(s) - 1;
659 assert(OP(m->g->strip[esub]) == OOR1);
660 for (;;) { /* find first matching branch */
661 dp = backref(m, sp, stop, ssub, esub, lev);
662 if (dp != NULL)
663 return(dp);
664 /* that one missed, try next one */
665 if (OP(m->g->strip[esub]) == O_CH)
666 return(NULL); /* there is none */
667 esub++;
668 assert(OP(m->g->strip[esub]) == OOR2);
669 ssub = esub + 1;
670 esub += OPND(m->g->strip[esub]);
671 if (OP(m->g->strip[esub]) == OOR2)
672 esub--;
673 else
674 assert(OP(m->g->strip[esub]) == O_CH);
675 }
676
677 case OLPAREN: /* must undo assignment if rest fails */
678 i = OPND(s);
679 assert(0 < i && i <= m->g->nsub);
680 offsave = m->pmatch[i].rm_so;
681 m->pmatch[i].rm_so = sp - m->offp;
682 dp = backref(m, sp, stop, ss+1, stopst, lev);
683 if (dp != NULL)
684 return(dp);
685 m->pmatch[i].rm_so = offsave;
686 return(NULL);
687
688 case ORPAREN: /* must undo assignment if rest fails */
689 i = OPND(s);
690 assert(0 < i && i <= m->g->nsub);
691 offsave = m->pmatch[i].rm_eo;
692 m->pmatch[i].rm_eo = sp - m->offp;
693 dp = backref(m, sp, stop, ss+1, stopst, lev);
694 if (dp != NULL)
695 return(dp);
696 m->pmatch[i].rm_eo = offsave;
697 return(NULL);
698
699 default: /* uh oh */
700 assert(nope);
701 break;
702 }
703
704 /* "can't happen" */
705 assert(nope);
706 /* NOTREACHED */
707 return NULL;
708 }
709
710 /*
711 - fast - step through the string at top speed
712 == static char *fast(struct match *m, char *start, \
713 == char *stop, sopno startst, sopno stopst);
714 */
715 static char * /* where tentative match ended, or NULL */
716 fast(m, start, stop, startst, stopst)
717 struct match *m;
718 char *start;
719 char *stop;
720 sopno startst;
721 sopno stopst;
722 {
723 states st = m->st;
724 states fresh = m->fresh;
725 states tmp = m->tmp;
726 char *p = start;
727 int c = (start == m->beginp) ? OUT : *(start-1);
728 int lastc; /* previous c */
729 int flagch;
730 int i;
731 char *coldp; /* last p after which no match was underway */
732
733 CLEAR(st);
734 SET1(st, startst);
735 st = step(m->g, startst, stopst, st, NOTHING, st);
736 ASSIGN(fresh, st);
737 SP("start", st, *p);
738 coldp = NULL;
739 for (;;) {
740 /* next character */
741 lastc = c;
742 c = (p == m->endp) ? OUT : *p;
743 if (EQ(st, fresh))
744 coldp = p;
745
746 /* is there an EOL and/or BOL between lastc and c? */
747 flagch = '\0';
748 i = 0;
749 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
750 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
751 flagch = BOL;
752 i = m->g->nbol;
753 }
754 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
755 (c == OUT && !(m->eflags®_NOTEOL)) ) {
756 flagch = (flagch == BOL) ? BOLEOL : EOL;
757 i += m->g->neol;
758 }
759 if (i != 0) {
760 for (; i > 0; i--)
761 st = step(m->g, startst, stopst, st, flagch, st);
762 SP("boleol", st, c);
763 }
764
765 /* how about a word boundary? */
766 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
767 (c != OUT && ISWORD(c)) ) {
768 flagch = BOW;
769 }
770 if ( (lastc != OUT && ISWORD(lastc)) &&
771 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
772 flagch = EOW;
773 }
774 if (flagch == BOW || flagch == EOW) {
775 st = step(m->g, startst, stopst, st, flagch, st);
776 SP("boweow", st, c);
777 }
778
779 /* are we done? */
780 if (ISSET(st, stopst) || p == stop)
781 break; /* NOTE BREAK OUT */
782
783 /* no, we must deal with this character */
784 ASSIGN(tmp, st);
785 ASSIGN(st, fresh);
786 assert(c != OUT);
787 st = step(m->g, startst, stopst, tmp, c, st);
788 SP("aft", st, c);
789 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
790 p++;
791 }
792
793 assert(coldp != NULL);
794 m->coldp = coldp;
795 if (ISSET(st, stopst))
796 return(p+1);
797 else
798 return(NULL);
799 }
800
801 /*
802 - slow - step through the string more deliberately
803 == static char *slow(struct match *m, char *start, \
804 == char *stop, sopno startst, sopno stopst);
805 */
806 static char * /* where it ended */
807 slow(m, start, stop, startst, stopst)
808 struct match *m;
809 char *start;
810 char *stop;
811 sopno startst;
812 sopno stopst;
813 {
814 states st = m->st;
815 states empty = m->empty;
816 states tmp = m->tmp;
817 char *p = start;
818 int c = (start == m->beginp) ? OUT : *(start-1);
819 int lastc; /* previous c */
820 int flagch;
821 int i;
822 char *matchp; /* last p at which a match ended */
823
824 AT("slow", start, stop, startst, stopst);
825 CLEAR(st);
826 SET1(st, startst);
827 SP("sstart", st, *p);
828 st = step(m->g, startst, stopst, st, NOTHING, st);
829 matchp = NULL;
830 for (;;) {
831 /* next character */
832 lastc = c;
833 c = (p == m->endp) ? OUT : *p;
834
835 /* is there an EOL and/or BOL between lastc and c? */
836 flagch = '\0';
837 i = 0;
838 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
839 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
840 flagch = BOL;
841 i = m->g->nbol;
842 }
843 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
844 (c == OUT && !(m->eflags®_NOTEOL)) ) {
845 flagch = (flagch == BOL) ? BOLEOL : EOL;
846 i += m->g->neol;
847 }
848 if (i != 0) {
849 for (; i > 0; i--)
850 st = step(m->g, startst, stopst, st, flagch, st);
851 SP("sboleol", st, c);
852 }
853
854 /* how about a word boundary? */
855 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
856 (c != OUT && ISWORD(c)) ) {
857 flagch = BOW;
858 }
859 if ( (lastc != OUT && ISWORD(lastc)) &&
860 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
861 flagch = EOW;
862 }
863 if (flagch == BOW || flagch == EOW) {
864 st = step(m->g, startst, stopst, st, flagch, st);
865 SP("sboweow", st, c);
866 }
867
868 /* are we done? */
869 if (ISSET(st, stopst))
870 matchp = p;
871 if (EQ(st, empty) || p == stop)
872 break; /* NOTE BREAK OUT */
873
874 /* no, we must deal with this character */
875 ASSIGN(tmp, st);
876 ASSIGN(st, empty);
877 assert(c != OUT);
878 st = step(m->g, startst, stopst, tmp, c, st);
879 SP("saft", st, c);
880 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
881 p++;
882 }
883
884 return(matchp);
885 }
886
887
888 /*
889 - step - map set of states reachable before char to set reachable after
890 == static states step(struct re_guts *g, sopno start, sopno stop, \
891 == states bef, int ch, states aft);
892 == #define BOL (OUT+1)
893 == #define EOL (BOL+1)
894 == #define BOLEOL (BOL+2)
895 == #define NOTHING (BOL+3)
896 == #define BOW (BOL+4)
897 == #define EOW (BOL+5)
898 == #define CODEMAX (BOL+5) // highest code used
899 == #define NONCHAR(c) ((c) > CHAR_MAX)
900 == #define NNONCHAR (CODEMAX-CHAR_MAX)
901 */
902 static states
903 step(g, start, stop, bef, ch, aft)
904 struct re_guts *g;
905 sopno start; /* start state within strip */
906 sopno stop; /* state after stop state within strip */
907 states bef; /* states reachable before */
908 int ch; /* character or NONCHAR code */
909 states aft; /* states already known reachable after */
910 {
911 cset *cs;
912 sop s;
913 sopno pc;
914 onestate here; /* note, macros know this name */
915 sopno look;
916 int i;
917
918 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
919 s = g->strip[pc];
920 switch (OP(s)) {
921 case OEND:
922 assert(pc == stop-1);
923 break;
924 case OCHAR:
925 /* only characters can match */
926 assert(!NONCHAR(ch) || ch != (char)OPND(s));
927 if (ch == (char)OPND(s))
928 FWD(aft, bef, 1);
929 break;
930 case OBOL:
931 if (ch == BOL || ch == BOLEOL)
932 FWD(aft, bef, 1);
933 break;
934 case OEOL:
935 if (ch == EOL || ch == BOLEOL)
936 FWD(aft, bef, 1);
937 break;
938 case OBOW:
939 if (ch == BOW)
940 FWD(aft, bef, 1);
941 break;
942 case OEOW:
943 if (ch == EOW)
944 FWD(aft, bef, 1);
945 break;
946 case OANY:
947 if (!NONCHAR(ch))
948 FWD(aft, bef, 1);
949 break;
950 case OANYOF:
951 cs = &g->sets[OPND(s)];
952 if (!NONCHAR(ch) && CHIN(cs, ch))
953 FWD(aft, bef, 1);
954 break;
955 case OBACK_: /* ignored here */
956 case O_BACK:
957 FWD(aft, aft, 1);
958 break;
959 case OPLUS_: /* forward, this is just an empty */
960 FWD(aft, aft, 1);
961 break;
962 case O_PLUS: /* both forward and back */
963 FWD(aft, aft, 1);
964 i = ISSETBACK(aft, OPND(s));
965 BACK(aft, aft, OPND(s));
966 if (!i && ISSETBACK(aft, OPND(s))) {
967 /* oho, must reconsider loop body */
968 pc -= OPND(s) + 1;
969 INIT(here, pc);
970 }
971 break;
972 case OQUEST_: /* two branches, both forward */
973 FWD(aft, aft, 1);
974 FWD(aft, aft, OPND(s));
975 break;
976 case O_QUEST: /* just an empty */
977 FWD(aft, aft, 1);
978 break;
979 case OLPAREN: /* not significant here */
980 case ORPAREN:
981 FWD(aft, aft, 1);
982 break;
983 case OCH_: /* mark the first two branches */
984 FWD(aft, aft, 1);
985 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
986 FWD(aft, aft, OPND(s));
987 break;
988 case OOR1: /* done a branch, find the O_CH */
989 if (ISSTATEIN(aft, here)) {
990 for (look = 1;
991 OP(s = g->strip[pc+look]) != O_CH;
992 look += OPND(s))
993 assert(OP(s) == OOR2);
994 FWD(aft, aft, look);
995 }
996 break;
997 case OOR2: /* propagate OCH_'s marking */
998 FWD(aft, aft, 1);
999 if (OP(g->strip[pc+OPND(s)]) != O_CH) {
1000 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1001 FWD(aft, aft, OPND(s));
1002 }
1003 break;
1004 case O_CH: /* just empty */
1005 FWD(aft, aft, 1);
1006 break;
1007 default: /* ooooops... */
1008 assert(nope);
1009 break;
1010 }
1011 }
1012
1013 return(aft);
1014 }
1015
1016 #ifdef REDEBUG
1017 /*
1018 - print - print a set of states
1019 == #ifdef REDEBUG
1020 == static void print(struct match *m, char *caption, states st, \
1021 == int ch, FILE *d);
1022 == #endif
1023 */
1024 static void
1025 print(m, caption, st, ch, d)
1026 struct match *m;
1027 char *caption;
1028 states st;
1029 int ch;
1030 FILE *d;
1031 {
1032 struct re_guts *g = m->g;
1033 int i;
1034 int first = 1;
1035
1036 if (!(m->eflags®_TRACE))
1037 return;
1038
1039 fprintf(d, "%s", caption);
1040 if (ch != '\0')
1041 fprintf(d, " %s", pchar(ch));
1042 for (i = 0; i < g->nstates; i++)
1043 if (ISSET(st, i)) {
1044 fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1045 first = 0;
1046 }
1047 fprintf(d, "\n");
1048 }
1049
1050 /*
1051 - at - print current situation
1052 == #ifdef REDEBUG
1053 == static void at(struct match *m, char *title, char *start, char *stop, \
1054 == sopno startst, sopno stopst);
1055 == #endif
1056 */
1057 static void
1058 at(m, title, start, stop, startst, stopst)
1059 struct match *m;
1060 char *title;
1061 char *start;
1062 char *stop;
1063 sopno startst;
1064 sopno stopst;
1065 {
1066 if (!(m->eflags®_TRACE))
1067 return;
1068
1069 printf("%s %s-", title, pchar(*start));
1070 printf("%s ", pchar(*stop));
1071 printf("%ld-%ld\n", (long)startst, (long)stopst);
1072 }
1073
1074 #ifndef PCHARDONE
1075 #define PCHARDONE /* never again */
1076 /*
1077 - pchar - make a character printable
1078 == #ifdef REDEBUG
1079 == static char *pchar(int ch);
1080 == #endif
1081 *
1082 * Is this identical to regchar() over in debug.c? Well, yes. But a
1083 * duplicate here avoids having a debugging-capable regexec.o tied to
1084 * a matching debug.o, and this is convenient. It all disappears in
1085 * the non-debug compilation anyway, so it doesn't matter much.
1086 */
1087 static char * /* -> representation */
1088 pchar(ch)
1089 int ch;
1090 {
1091 static char pbuf[10];
1092
1093 if (isprint(ch) || ch == ' ')
1094 (void)snprintf(pbuf, sizeof pbuf, "%c", ch);
1095 else
1096 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
1097 return(pbuf);
1098 }
1099 #endif
1100 #endif
1101
1102 #undef matcher
1103 #undef fast
1104 #undef slow
1105 #undef dissect
1106 #undef backref
1107 #undef step
1108 #undef print
1109 #undef at
1110 #undef match
1111