engine.c revision 1.16 1 /* $NetBSD: engine.c,v 1.16 2004/03/12 22:34:09 enami Exp $ */
2
3 /*-
4 * Copyright (c) 1992, 1993, 1994
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Henry Spencer.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)engine.c 8.5 (Berkeley) 3/20/94
35 */
36
37 /*-
38 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
39 *
40 * This code is derived from software contributed to Berkeley by
41 * Henry Spencer.
42 *
43 * Redistribution and use in source and binary forms, with or without
44 * modification, are permitted provided that the following conditions
45 * are met:
46 * 1. Redistributions of source code must retain the above copyright
47 * notice, this list of conditions and the following disclaimer.
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 * 3. All advertising materials mentioning features or use of this software
52 * must display the following acknowledgement:
53 * This product includes software developed by the University of
54 * California, Berkeley and its contributors.
55 * 4. Neither the name of the University nor the names of its contributors
56 * may be used to endorse or promote products derived from this software
57 * without specific prior written permission.
58 *
59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
62 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
69 * SUCH DAMAGE.
70 *
71 * @(#)engine.c 8.5 (Berkeley) 3/20/94
72 */
73
74 /*
75 * The matching engine and friends. This file is #included by regexec.c
76 * after suitable #defines of a variety of macros used herein, so that
77 * different state representations can be used without duplicating masses
78 * of code.
79 */
80
81 #ifdef SNAMES
82 #define matcher smatcher
83 #define fast sfast
84 #define slow sslow
85 #define dissect sdissect
86 #define backref sbackref
87 #define step sstep
88 #define print sprint
89 #define at sat
90 #define match smat
91 #define nope snope
92 #endif
93 #ifdef LNAMES
94 #define matcher lmatcher
95 #define fast lfast
96 #define slow lslow
97 #define dissect ldissect
98 #define backref lbackref
99 #define step lstep
100 #define print lprint
101 #define at lat
102 #define match lmat
103 #define nope lnope
104 #endif
105
106 /* another structure passed up and down to avoid zillions of parameters */
107 struct match {
108 struct re_guts *g;
109 int eflags;
110 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
111 char *offp; /* offsets work from here */
112 char *beginp; /* start of string -- virtual NUL precedes */
113 char *endp; /* end of string -- virtual NUL here */
114 char *coldp; /* can be no match starting before here */
115 char **lastpos; /* [nplus+1] */
116 STATEVARS;
117 states st; /* current states */
118 states fresh; /* states for a fresh start */
119 states tmp; /* temporary */
120 states empty; /* empty set of states */
121 };
122
123 /* ========= begin header generated by ./mkh ========= */
124 #ifdef __cplusplus
125 extern "C" {
126 #endif
127
128 /* === engine.c === */
129 static int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags));
130 static char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
131 static char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev));
132 static char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
133 static char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
134 static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft));
135 #define BOL (OUT+1)
136 #define EOL (BOL+1)
137 #define BOLEOL (BOL+2)
138 #define NOTHING (BOL+3)
139 #define BOW (BOL+4)
140 #define EOW (BOL+5)
141 #define CODEMAX (BOL+5) /* highest code used */
142 #define NONCHAR(c) ((c) > CHAR_MAX)
143 #define NNONCHAR (CODEMAX-CHAR_MAX)
144 #ifdef REDEBUG
145 static void print __P((struct match *m, char *caption, states st, int ch, FILE *d));
146 #endif
147 #ifdef REDEBUG
148 static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst));
149 #endif
150 #ifdef REDEBUG
151 static char *pchar __P((int ch));
152 #endif
153
154 #ifdef __cplusplus
155 }
156 #endif
157 /* ========= end header generated by ./mkh ========= */
158
159 #ifdef REDEBUG
160 #define SP(t, s, c) print(m, t, s, c, stdout)
161 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
162 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); }
163 static int nope = 0;
164 #else
165 #define SP(t, s, c) /* nothing */
166 #define AT(t, p1, p2, s1, s2) /* nothing */
167 #define NOTE(s) /* nothing */
168 #endif
169
170 /*
171 - matcher - the actual matching engine
172 == static int matcher(struct re_guts *g, char *string, \
173 == size_t nmatch, regmatch_t pmatch[], int eflags);
174 */
175 static int /* 0 success, REG_NOMATCH failure */
176 matcher(g, string, nmatch, pmatch, eflags)
177 struct re_guts *g;
178 char *string;
179 size_t nmatch;
180 regmatch_t pmatch[];
181 int eflags;
182 {
183 char *endp;
184 int i;
185 struct match mv;
186 struct match *m = &mv;
187 char *dp;
188 const sopno gf = g->firststate+1; /* +1 for OEND */
189 const sopno gl = g->laststate;
190 char *start;
191 char *stop;
192 int error = 0;
193
194 _DIAGASSERT(g != NULL);
195 _DIAGASSERT(string != NULL);
196 /* pmatch checked below */
197
198 /* simplify the situation where possible */
199 if (g->cflags®_NOSUB)
200 nmatch = 0;
201 if (eflags®_STARTEND) {
202 _DIAGASSERT(pmatch != NULL);
203 start = string + (size_t)pmatch[0].rm_so;
204 stop = string + (size_t)pmatch[0].rm_eo;
205 } else {
206 start = string;
207 stop = start + strlen(start);
208 }
209 if (stop < start)
210 return(REG_INVARG);
211
212 /* prescreening; this does wonders for this rather slow code */
213 if (g->must != NULL) {
214 for (dp = start; dp < stop; dp++)
215 if (*dp == g->must[0] && stop - dp >= g->mlen &&
216 memcmp(dp, g->must, (size_t)g->mlen) == 0)
217 break;
218 if (dp == stop) /* we didn't find g->must */
219 return(REG_NOMATCH);
220 }
221
222 /* match struct setup */
223 m->g = g;
224 m->eflags = eflags;
225 m->pmatch = NULL;
226 m->lastpos = NULL;
227 m->offp = string;
228 m->beginp = start;
229 m->endp = stop;
230 STATESETUP(m, 4);
231 SETUP(m->st);
232 SETUP(m->fresh);
233 SETUP(m->tmp);
234 SETUP(m->empty);
235 CLEAR(m->empty);
236
237 /* this loop does only one repetition except for backrefs */
238 for (;;) {
239 endp = fast(m, start, stop, gf, gl);
240 if (endp == NULL) { /* a miss */
241 error = REG_NOMATCH;
242 goto done;
243 }
244 if (nmatch == 0 && !g->backrefs)
245 break; /* no further info needed */
246
247 /* where? */
248 assert(m->coldp != NULL);
249 for (;;) {
250 NOTE("finding start");
251 endp = slow(m, m->coldp, stop, gf, gl);
252 if (endp != NULL)
253 break;
254 assert(m->coldp < m->endp);
255 m->coldp++;
256 }
257 if (nmatch == 1 && !g->backrefs)
258 break; /* no further info needed */
259
260 /* oh my, he wants the subexpressions... */
261 if (m->pmatch == NULL)
262 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
263 sizeof(regmatch_t));
264 if (m->pmatch == NULL) {
265 error = REG_ESPACE;
266 goto done;
267 }
268 for (i = 1; i <= m->g->nsub; i++)
269 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = (regoff_t)-1;
270 if (!g->backrefs && !(m->eflags®_BACKR)) {
271 NOTE("dissecting");
272 dp = dissect(m, m->coldp, endp, gf, gl);
273 } else {
274 if (g->nplus > 0 && m->lastpos == NULL)
275 m->lastpos = (char **)malloc((g->nplus+1) *
276 sizeof(char *));
277 if (g->nplus > 0 && m->lastpos == NULL) {
278 error = REG_ESPACE;
279 goto done;
280 }
281 NOTE("backref dissect");
282 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
283 }
284 if (dp != NULL)
285 break;
286
287 /* uh-oh... we couldn't find a subexpression-level match */
288 assert(g->backrefs); /* must be back references doing it */
289 assert(g->nplus == 0 || m->lastpos != NULL);
290 for (;;) {
291 if (dp != NULL || endp <= m->coldp)
292 break; /* defeat */
293 NOTE("backoff");
294 endp = slow(m, m->coldp, endp-1, gf, gl);
295 if (endp == NULL)
296 break; /* defeat */
297 /* try it on a shorter possibility */
298 #ifndef NDEBUG
299 for (i = 1; i <= m->g->nsub; i++) {
300 assert(m->pmatch[i].rm_so == (regoff_t)-1);
301 assert(m->pmatch[i].rm_eo == (regoff_t)-1);
302 }
303 #endif
304 NOTE("backoff dissect");
305 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
306 }
307 assert(dp == NULL || dp == endp);
308 if (dp != NULL) /* found a shorter one */
309 break;
310
311 /* despite initial appearances, there is no match here */
312 NOTE("false alarm");
313 start = m->coldp + 1; /* recycle starting later */
314 assert(start <= stop);
315 }
316
317 /* fill in the details if requested */
318 if (nmatch > 0) {
319 _DIAGASSERT(pmatch != NULL);
320 pmatch[0].rm_so = m->coldp - m->offp;
321 pmatch[0].rm_eo = endp - m->offp;
322 }
323 if (nmatch > 1) {
324 assert(m->pmatch != NULL);
325 for (i = 1; i < nmatch; i++)
326 if (i <= m->g->nsub)
327 pmatch[i] = m->pmatch[i];
328 else {
329 pmatch[i].rm_so = (regoff_t)-1;
330 pmatch[i].rm_eo = (regoff_t)-1;
331 }
332 }
333
334 done:
335 if (m->pmatch != NULL) {
336 free(m->pmatch);
337 m->pmatch = NULL;
338 }
339 if (m->lastpos != NULL) {
340 free(m->lastpos);
341 m->lastpos = NULL;
342 }
343 STATETEARDOWN(m);
344 return error;
345 }
346
347 /*
348 - dissect - figure out what matched what, no back references
349 == static char *dissect(struct match *m, char *start, \
350 == char *stop, sopno startst, sopno stopst);
351 */
352 static char * /* == stop (success) always */
353 dissect(m, start, stop, startst, stopst)
354 struct match *m;
355 char *start;
356 char *stop;
357 sopno startst;
358 sopno stopst;
359 {
360 int i;
361 sopno ss; /* start sop of current subRE */
362 sopno es; /* end sop of current subRE */
363 char *sp; /* start of string matched by it */
364 char *stp; /* string matched by it cannot pass here */
365 char *rest; /* start of rest of string */
366 char *tail; /* string unmatched by rest of RE */
367 sopno ssub; /* start sop of subsubRE */
368 sopno esub; /* end sop of subsubRE */
369 char *ssp; /* start of string matched by subsubRE */
370 char *sep; /* end of string matched by subsubRE */
371 char *oldssp; /* previous ssp */
372 #ifndef NDEBUG
373 char *dp;
374 #endif
375
376 _DIAGASSERT(m != NULL);
377 _DIAGASSERT(start != NULL);
378 _DIAGASSERT(stop != NULL);
379
380 AT("diss", start, stop, startst, stopst);
381 sp = start;
382 for (ss = startst; ss < stopst; ss = es) {
383 /* identify end of subRE */
384 es = ss;
385 switch (OP(m->g->strip[es])) {
386 case OPLUS_:
387 case OQUEST_:
388 es += OPND(m->g->strip[es]);
389 break;
390 case OCH_:
391 while (OP(m->g->strip[es]) != O_CH)
392 es += OPND(m->g->strip[es]);
393 break;
394 }
395 es++;
396
397 /* figure out what it matched */
398 switch (OP(m->g->strip[ss])) {
399 case OEND:
400 assert(nope);
401 break;
402 case OCHAR:
403 sp++;
404 break;
405 case OBOL:
406 case OEOL:
407 case OBOW:
408 case OEOW:
409 break;
410 case OANY:
411 case OANYOF:
412 sp++;
413 break;
414 case OBACK_:
415 case O_BACK:
416 assert(nope);
417 break;
418 /* cases where length of match is hard to find */
419 case OQUEST_:
420 stp = stop;
421 for (;;) {
422 /* how long could this one be? */
423 rest = slow(m, sp, stp, ss, es);
424 assert(rest != NULL); /* it did match */
425 /* could the rest match the rest? */
426 tail = slow(m, rest, stop, es, stopst);
427 if (tail == stop)
428 break; /* yes! */
429 /* no -- try a shorter match for this one */
430 stp = rest - 1;
431 assert(stp >= sp); /* it did work */
432 }
433 ssub = ss + 1;
434 esub = es - 1;
435 /* did innards match? */
436 if (slow(m, sp, rest, ssub, esub) != NULL) {
437 #ifdef NDEBUG
438 (void)
439 #else
440 dp =
441 #endif
442 dissect(m, sp, rest, ssub, esub);
443 assert(dp == rest);
444 } else /* no */
445 assert(sp == rest);
446 sp = rest;
447 break;
448 case OPLUS_:
449 stp = stop;
450 for (;;) {
451 /* how long could this one be? */
452 rest = slow(m, sp, stp, ss, es);
453 assert(rest != NULL); /* it did match */
454 /* could the rest match the rest? */
455 tail = slow(m, rest, stop, es, stopst);
456 if (tail == stop)
457 break; /* yes! */
458 /* no -- try a shorter match for this one */
459 stp = rest - 1;
460 assert(stp >= sp); /* it did work */
461 }
462 ssub = ss + 1;
463 esub = es - 1;
464 ssp = sp;
465 oldssp = ssp;
466 for (;;) { /* find last match of innards */
467 sep = slow(m, ssp, rest, ssub, esub);
468 if (sep == NULL || sep == ssp)
469 break; /* failed or matched null */
470 oldssp = ssp; /* on to next try */
471 ssp = sep;
472 }
473 if (sep == NULL) {
474 /* last successful match */
475 sep = ssp;
476 ssp = oldssp;
477 }
478 assert(sep == rest); /* must exhaust substring */
479 assert(slow(m, ssp, sep, ssub, esub) == rest);
480 #ifdef NDEBUG
481 (void)
482 #else
483 dp =
484 #endif
485 dissect(m, ssp, sep, ssub, esub);
486 assert(dp == sep);
487 sp = rest;
488 break;
489 case OCH_:
490 stp = stop;
491 for (;;) {
492 /* how long could this one be? */
493 rest = slow(m, sp, stp, ss, es);
494 assert(rest != NULL); /* it did match */
495 /* could the rest match the rest? */
496 tail = slow(m, rest, stop, es, stopst);
497 if (tail == stop)
498 break; /* yes! */
499 /* no -- try a shorter match for this one */
500 stp = rest - 1;
501 assert(stp >= sp); /* it did work */
502 }
503 ssub = ss + 1;
504 esub = ss + OPND(m->g->strip[ss]) - 1;
505 assert(OP(m->g->strip[esub]) == OOR1);
506 for (;;) { /* find first matching branch */
507 if (slow(m, sp, rest, ssub, esub) == rest)
508 break; /* it matched all of it */
509 /* that one missed, try next one */
510 assert(OP(m->g->strip[esub]) == OOR1);
511 esub++;
512 assert(OP(m->g->strip[esub]) == OOR2);
513 ssub = esub + 1;
514 esub += OPND(m->g->strip[esub]);
515 if (OP(m->g->strip[esub]) == OOR2)
516 esub--;
517 else
518 assert(OP(m->g->strip[esub]) == O_CH);
519 }
520 #ifdef NDEBUG
521 (void)
522 #else
523 dp =
524 #endif
525 dissect(m, sp, rest, ssub, esub);
526 assert(dp == rest);
527 sp = rest;
528 break;
529 case O_PLUS:
530 case O_QUEST:
531 case OOR1:
532 case OOR2:
533 case O_CH:
534 assert(nope);
535 break;
536 case OLPAREN:
537 i = OPND(m->g->strip[ss]);
538 assert(0 < i && i <= m->g->nsub);
539 m->pmatch[i].rm_so = sp - m->offp;
540 break;
541 case ORPAREN:
542 i = OPND(m->g->strip[ss]);
543 assert(0 < i && i <= m->g->nsub);
544 m->pmatch[i].rm_eo = sp - m->offp;
545 break;
546 default: /* uh oh */
547 assert(nope);
548 break;
549 }
550 }
551
552 assert(sp == stop);
553 return(sp);
554 }
555
556 /*
557 - backref - figure out what matched what, figuring in back references
558 == static char *backref(struct match *m, char *start, \
559 == char *stop, sopno startst, sopno stopst, sopno lev);
560 */
561 static char * /* == stop (success) or NULL (failure) */
562 backref(m, start, stop, startst, stopst, lev)
563 struct match *m;
564 char *start;
565 char *stop;
566 sopno startst;
567 sopno stopst;
568 sopno lev; /* PLUS nesting level */
569 {
570 int i;
571 sopno ss; /* start sop of current subRE */
572 char *sp; /* start of string matched by it */
573 sopno ssub; /* start sop of subsubRE */
574 sopno esub; /* end sop of subsubRE */
575 char *ssp; /* start of string matched by subsubRE */
576 char *dp;
577 size_t len;
578 int hard;
579 sop s;
580 regoff_t offsave;
581 cset *cs;
582
583 _DIAGASSERT(m != NULL);
584 _DIAGASSERT(start != NULL);
585 _DIAGASSERT(stop != NULL);
586
587 AT("back", start, stop, startst, stopst);
588 sp = start;
589
590 /* get as far as we can with easy stuff */
591 hard = 0;
592 for (ss = startst; !hard && ss < stopst; ss++)
593 switch (OP(s = m->g->strip[ss])) {
594 case OCHAR:
595 if (sp == stop || *sp++ != (char)OPND(s))
596 return(NULL);
597 break;
598 case OANY:
599 if (sp == stop)
600 return(NULL);
601 sp++;
602 break;
603 case OANYOF:
604 cs = &m->g->sets[OPND(s)];
605 if (sp == stop || !CHIN(cs, *sp++))
606 return(NULL);
607 break;
608 case OBOL:
609 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
610 (sp < m->endp && *(sp-1) == '\n' &&
611 (m->g->cflags®_NEWLINE)) )
612 { /* yes */ }
613 else
614 return(NULL);
615 break;
616 case OEOL:
617 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
618 (sp < m->endp && *sp == '\n' &&
619 (m->g->cflags®_NEWLINE)) )
620 { /* yes */ }
621 else
622 return(NULL);
623 break;
624 case OBOW:
625 if (((sp == m->beginp && !(m->eflags®_NOTBOL)) ||
626 (sp < m->endp && *(sp - 1) == '\n' &&
627 (m->g->cflags®_NEWLINE)) ||
628 (sp > m->beginp &&
629 !ISWORD(*(unsigned char *)(sp - 1)))) &&
630 (sp < m->endp && ISWORD(*(unsigned char *)sp)))
631 { /* yes */ }
632 else
633 return(NULL);
634 break;
635 case OEOW:
636 if (((sp == m->endp && !(m->eflags®_NOTEOL)) ||
637 (sp < m->endp && *sp == '\n' &&
638 (m->g->cflags®_NEWLINE)) ||
639 (sp < m->endp && !ISWORD(*(unsigned char *)sp))) &&
640 (sp > m->beginp &&
641 ISWORD(*(unsigned char *)(sp - 1))))
642 { /* yes */ }
643 else
644 return(NULL);
645 break;
646 case O_QUEST:
647 break;
648 case OOR1: /* matches null but needs to skip */
649 ss++;
650 s = m->g->strip[ss];
651 do {
652 assert(OP(s) == OOR2);
653 ss += OPND(s);
654 } while (OP(s = m->g->strip[ss]) != O_CH);
655 /* note that the ss++ gets us past the O_CH */
656 break;
657 default: /* have to make a choice */
658 hard = 1;
659 break;
660 }
661 if (!hard) { /* that was it! */
662 if (sp != stop)
663 return(NULL);
664 return(sp);
665 }
666 ss--; /* adjust for the for's final increment */
667
668 /* the hard stuff */
669 AT("hard", sp, stop, ss, stopst);
670 s = m->g->strip[ss];
671 switch (OP(s)) {
672 case OBACK_: /* the vilest depths */
673 i = OPND(s);
674 assert(0 < i && i <= m->g->nsub);
675 if (m->pmatch[i].rm_eo == (regoff_t)-1)
676 return(NULL);
677 assert(m->pmatch[i].rm_so != (regoff_t)-1);
678 len = (size_t)(m->pmatch[i].rm_eo - m->pmatch[i].rm_so);
679 assert(stop - m->beginp >= len);
680 if (sp > stop - len)
681 return(NULL); /* not enough left to match */
682 ssp = m->offp + (size_t)m->pmatch[i].rm_so;
683 if (memcmp(sp, ssp, len) != 0)
684 return(NULL);
685 while (m->g->strip[ss] != SOP(O_BACK, i))
686 ss++;
687 return(backref(m, sp+len, stop, ss+1, stopst, lev));
688
689 case OQUEST_: /* to null or not */
690 dp = backref(m, sp, stop, ss+1, stopst, lev);
691 if (dp != NULL)
692 return(dp); /* not */
693 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
694
695 case OPLUS_:
696 assert(m->lastpos != NULL);
697 assert(lev+1 <= m->g->nplus);
698 m->lastpos[lev+1] = sp;
699 return(backref(m, sp, stop, ss+1, stopst, lev+1));
700
701 case O_PLUS:
702 if (sp == m->lastpos[lev]) /* last pass matched null */
703 return(backref(m, sp, stop, ss+1, stopst, lev-1));
704 /* try another pass */
705 m->lastpos[lev] = sp;
706 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
707 if (dp == NULL)
708 dp = backref(m, sp, stop, ss+1, stopst, lev-1);
709 return(dp);
710
711 case OCH_: /* find the right one, if any */
712 ssub = ss + 1;
713 esub = ss + OPND(s) - 1;
714 assert(OP(m->g->strip[esub]) == OOR1);
715 for (;;) { /* find first matching branch */
716 dp = backref(m, sp, stop, ssub, esub, lev);
717 if (dp != NULL)
718 return(dp);
719 /* that one missed, try next one */
720 if (OP(m->g->strip[esub]) == O_CH)
721 return(NULL); /* there is none */
722 esub++;
723 assert(OP(m->g->strip[esub]) == OOR2);
724 ssub = esub + 1;
725 esub += OPND(m->g->strip[esub]);
726 if (OP(m->g->strip[esub]) == OOR2)
727 esub--;
728 else
729 assert(OP(m->g->strip[esub]) == O_CH);
730 }
731
732 case OLPAREN: /* must undo assignment if rest fails */
733 i = OPND(s);
734 assert(0 < i && i <= m->g->nsub);
735 offsave = m->pmatch[i].rm_so;
736 m->pmatch[i].rm_so = sp - m->offp;
737 dp = backref(m, sp, stop, ss+1, stopst, lev);
738 if (dp != NULL)
739 return(dp);
740 m->pmatch[i].rm_so = offsave;
741 return(NULL);
742
743 case ORPAREN: /* must undo assignment if rest fails */
744 i = OPND(s);
745 assert(0 < i && i <= m->g->nsub);
746 offsave = m->pmatch[i].rm_eo;
747 m->pmatch[i].rm_eo = sp - m->offp;
748 dp = backref(m, sp, stop, ss+1, stopst, lev);
749 if (dp != NULL)
750 return(dp);
751 m->pmatch[i].rm_eo = offsave;
752 return(NULL);
753
754 default: /* uh oh */
755 assert(nope);
756 break;
757 }
758
759 /* "can't happen" */
760 assert(nope);
761 /* NOTREACHED */
762 return NULL;
763 }
764
765 /*
766 - fast - step through the string at top speed
767 == static char *fast(struct match *m, char *start, \
768 == char *stop, sopno startst, sopno stopst);
769 */
770 static char * /* where tentative match ended, or NULL */
771 fast(m, start, stop, startst, stopst)
772 struct match *m;
773 char *start;
774 char *stop;
775 sopno startst;
776 sopno stopst;
777 {
778 states st = m->st;
779 states fresh = m->fresh;
780 states tmp = m->tmp;
781 char *p = start;
782 int c = (start == m->beginp) ? OUT : *(unsigned char *)(start - 1);
783 int lastc; /* previous c */
784 int flagch;
785 int i;
786 char *coldp; /* last p after which no match was underway */
787
788 _DIAGASSERT(m != NULL);
789 _DIAGASSERT(start != NULL);
790 _DIAGASSERT(stop != NULL);
791
792 CLEAR(st);
793 SET1(st, startst);
794 st = step(m->g, startst, stopst, st, NOTHING, st);
795 ASSIGN(fresh, st);
796 SP("start", st, *p);
797 coldp = NULL;
798 for (;;) {
799 /* next character */
800 lastc = c;
801 c = (p == m->endp) ? OUT : *(unsigned char *)p;
802 if (EQ(st, fresh))
803 coldp = p;
804
805 /* is there an EOL and/or BOL between lastc and c? */
806 flagch = '\0';
807 i = 0;
808 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
809 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
810 flagch = BOL;
811 i = m->g->nbol;
812 }
813 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
814 (c == OUT && !(m->eflags®_NOTEOL)) ) {
815 flagch = (flagch == BOL) ? BOLEOL : EOL;
816 i += m->g->neol;
817 }
818 if (i != 0) {
819 for (; i > 0; i--)
820 st = step(m->g, startst, stopst, st, flagch, st);
821 SP("boleol", st, c);
822 }
823
824 /* how about a word boundary? */
825 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
826 (c != OUT && ISWORD(c)) ) {
827 flagch = BOW;
828 }
829 if ( (lastc != OUT && ISWORD(lastc)) &&
830 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
831 flagch = EOW;
832 }
833 if (flagch == BOW || flagch == EOW) {
834 st = step(m->g, startst, stopst, st, flagch, st);
835 SP("boweow", st, c);
836 }
837
838 /* are we done? */
839 if (ISSET(st, stopst) || p == stop)
840 break; /* NOTE BREAK OUT */
841
842 /* no, we must deal with this character */
843 ASSIGN(tmp, st);
844 ASSIGN(st, fresh);
845 assert(c != OUT);
846 st = step(m->g, startst, stopst, tmp, c, st);
847 SP("aft", st, c);
848 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
849 p++;
850 }
851
852 assert(coldp != NULL);
853 m->coldp = coldp;
854 if (ISSET(st, stopst))
855 return(p+1);
856 else
857 return(NULL);
858 }
859
860 /*
861 - slow - step through the string more deliberately
862 == static char *slow(struct match *m, char *start, \
863 == char *stop, sopno startst, sopno stopst);
864 */
865 static char * /* where it ended */
866 slow(m, start, stop, startst, stopst)
867 struct match *m;
868 char *start;
869 char *stop;
870 sopno startst;
871 sopno stopst;
872 {
873 states st = m->st;
874 states empty = m->empty;
875 states tmp = m->tmp;
876 char *p = start;
877 int c = (start == m->beginp) ? OUT : *(unsigned char *)(start - 1);
878 int lastc; /* previous c */
879 int flagch;
880 int i;
881 char *matchp; /* last p at which a match ended */
882
883 _DIAGASSERT(m != NULL);
884 _DIAGASSERT(start != NULL);
885 _DIAGASSERT(stop != NULL);
886
887 AT("slow", start, stop, startst, stopst);
888 CLEAR(st);
889 SET1(st, startst);
890 SP("sstart", st, *p);
891 st = step(m->g, startst, stopst, st, NOTHING, st);
892 matchp = NULL;
893 for (;;) {
894 /* next character */
895 lastc = c;
896 c = (p == m->endp) ? OUT : *(unsigned char *)p;
897
898 /* is there an EOL and/or BOL between lastc and c? */
899 flagch = '\0';
900 i = 0;
901 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
902 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
903 flagch = BOL;
904 i = m->g->nbol;
905 }
906 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
907 (c == OUT && !(m->eflags®_NOTEOL)) ) {
908 flagch = (flagch == BOL) ? BOLEOL : EOL;
909 i += m->g->neol;
910 }
911 if (i != 0) {
912 for (; i > 0; i--)
913 st = step(m->g, startst, stopst, st, flagch, st);
914 SP("sboleol", st, c);
915 }
916
917 /* how about a word boundary? */
918 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
919 (c != OUT && ISWORD(c)) ) {
920 flagch = BOW;
921 }
922 if ( (lastc != OUT && ISWORD(lastc)) &&
923 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
924 flagch = EOW;
925 }
926 if (flagch == BOW || flagch == EOW) {
927 st = step(m->g, startst, stopst, st, flagch, st);
928 SP("sboweow", st, c);
929 }
930
931 /* are we done? */
932 if (ISSET(st, stopst))
933 matchp = p;
934 if (EQ(st, empty) || p == stop)
935 break; /* NOTE BREAK OUT */
936
937 /* no, we must deal with this character */
938 ASSIGN(tmp, st);
939 ASSIGN(st, empty);
940 assert(c != OUT);
941 st = step(m->g, startst, stopst, tmp, c, st);
942 SP("saft", st, c);
943 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
944 p++;
945 }
946
947 return(matchp);
948 }
949
950
951 /*
952 - step - map set of states reachable before char to set reachable after
953 == static states step(struct re_guts *g, sopno start, sopno stop, \
954 == states bef, int ch, states aft);
955 == #define BOL (OUT+1)
956 == #define EOL (BOL+1)
957 == #define BOLEOL (BOL+2)
958 == #define NOTHING (BOL+3)
959 == #define BOW (BOL+4)
960 == #define EOW (BOL+5)
961 == #define CODEMAX (BOL+5) // highest code used
962 == #define NONCHAR(c) ((c) > CHAR_MAX)
963 == #define NNONCHAR (CODEMAX-CHAR_MAX)
964 */
965 static states
966 step(g, start, stop, bef, ch, aft)
967 struct re_guts *g;
968 sopno start; /* start state within strip */
969 sopno stop; /* state after stop state within strip */
970 states bef; /* states reachable before */
971 int ch; /* character or NONCHAR code */
972 states aft; /* states already known reachable after */
973 {
974 cset *cs;
975 sop s;
976 sopno pc;
977 onestate here; /* note, macros know this name */
978 sopno look;
979 int i;
980
981 _DIAGASSERT(g != NULL);
982
983 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
984 s = g->strip[pc];
985 switch (OP(s)) {
986 case OEND:
987 assert(pc == stop-1);
988 break;
989 case OCHAR:
990 /* only characters can match */
991 assert(!NONCHAR(ch) || ch != (char)OPND(s));
992 if (ch == (char)OPND(s))
993 FWD(aft, bef, 1);
994 break;
995 case OBOL:
996 if (ch == BOL || ch == BOLEOL)
997 FWD(aft, bef, 1);
998 break;
999 case OEOL:
1000 if (ch == EOL || ch == BOLEOL)
1001 FWD(aft, bef, 1);
1002 break;
1003 case OBOW:
1004 if (ch == BOW)
1005 FWD(aft, bef, 1);
1006 break;
1007 case OEOW:
1008 if (ch == EOW)
1009 FWD(aft, bef, 1);
1010 break;
1011 case OANY:
1012 if (!NONCHAR(ch))
1013 FWD(aft, bef, 1);
1014 break;
1015 case OANYOF:
1016 cs = &g->sets[OPND(s)];
1017 if (!NONCHAR(ch) && CHIN(cs, ch))
1018 FWD(aft, bef, 1);
1019 break;
1020 case OBACK_: /* ignored here */
1021 case O_BACK:
1022 FWD(aft, aft, 1);
1023 break;
1024 case OPLUS_: /* forward, this is just an empty */
1025 FWD(aft, aft, 1);
1026 break;
1027 case O_PLUS: /* both forward and back */
1028 FWD(aft, aft, 1);
1029 i = ISSETBACK(aft, OPND(s));
1030 BACK(aft, aft, OPND(s));
1031 if (!i && ISSETBACK(aft, OPND(s))) {
1032 /* oho, must reconsider loop body */
1033 pc -= OPND(s) + 1;
1034 INIT(here, pc);
1035 }
1036 break;
1037 case OQUEST_: /* two branches, both forward */
1038 FWD(aft, aft, 1);
1039 FWD(aft, aft, OPND(s));
1040 break;
1041 case O_QUEST: /* just an empty */
1042 FWD(aft, aft, 1);
1043 break;
1044 case OLPAREN: /* not significant here */
1045 case ORPAREN:
1046 FWD(aft, aft, 1);
1047 break;
1048 case OCH_: /* mark the first two branches */
1049 FWD(aft, aft, 1);
1050 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1051 FWD(aft, aft, OPND(s));
1052 break;
1053 case OOR1: /* done a branch, find the O_CH */
1054 if (ISSTATEIN(aft, here)) {
1055 for (look = 1;
1056 OP(s = g->strip[pc+look]) != O_CH;
1057 look += OPND(s))
1058 assert(OP(s) == OOR2);
1059 FWD(aft, aft, look);
1060 }
1061 break;
1062 case OOR2: /* propagate OCH_'s marking */
1063 FWD(aft, aft, 1);
1064 if (OP(g->strip[pc+OPND(s)]) != O_CH) {
1065 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1066 FWD(aft, aft, OPND(s));
1067 }
1068 break;
1069 case O_CH: /* just empty */
1070 FWD(aft, aft, 1);
1071 break;
1072 default: /* ooooops... */
1073 assert(nope);
1074 break;
1075 }
1076 }
1077
1078 return(aft);
1079 }
1080
1081 #ifdef REDEBUG
1082 /*
1083 - print - print a set of states
1084 == #ifdef REDEBUG
1085 == static void print(struct match *m, char *caption, states st, \
1086 == int ch, FILE *d);
1087 == #endif
1088 */
1089 static void
1090 print(m, caption, st, ch, d)
1091 struct match *m;
1092 char *caption;
1093 states st;
1094 int ch;
1095 FILE *d;
1096 {
1097 struct re_guts *g = m->g;
1098 int i;
1099 int first = 1;
1100
1101 _DIAGASSERT(m != NULL);
1102 _DIAGASSERT(caption != NULL);
1103
1104 if (!(m->eflags®_TRACE))
1105 return;
1106
1107 _DIAGASSERT(d != NULL);
1108
1109 fprintf(d, "%s", caption);
1110 if (ch != '\0')
1111 fprintf(d, " %s", pchar(ch));
1112 for (i = 0; i < g->nstates; i++)
1113 if (ISSET(st, i)) {
1114 fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1115 first = 0;
1116 }
1117 fprintf(d, "\n");
1118 }
1119
1120 /*
1121 - at - print current situation
1122 == #ifdef REDEBUG
1123 == static void at(struct match *m, char *title, char *start, char *stop, \
1124 == sopno startst, sopno stopst);
1125 == #endif
1126 */
1127 static void
1128 at(m, title, start, stop, startst, stopst)
1129 struct match *m;
1130 char *title;
1131 char *start;
1132 char *stop;
1133 sopno startst;
1134 sopno stopst;
1135 {
1136
1137 _DIAGASSERT(m != NULL);
1138 _DIAGASSERT(title != NULL);
1139 _DIAGASSERT(start != NULL);
1140 _DIAGASSERT(stop != NULL);
1141
1142 if (!(m->eflags®_TRACE))
1143 return;
1144
1145 printf("%s %s-", title, pchar(*start));
1146 printf("%s ", pchar(*stop));
1147 printf("%ld-%ld\n", (long)startst, (long)stopst);
1148 }
1149
1150 #ifndef PCHARDONE
1151 #define PCHARDONE /* never again */
1152 /*
1153 - pchar - make a character printable
1154 == #ifdef REDEBUG
1155 == static char *pchar(int ch);
1156 == #endif
1157 *
1158 * Is this identical to regchar() over in debug.c? Well, yes. But a
1159 * duplicate here avoids having a debugging-capable regexec.o tied to
1160 * a matching debug.o, and this is convenient. It all disappears in
1161 * the non-debug compilation anyway, so it doesn't matter much.
1162 */
1163 static char * /* -> representation */
1164 pchar(ch)
1165 int ch;
1166 {
1167 static char pbuf[10];
1168
1169 if (isprint(ch) || ch == ' ')
1170 (void)snprintf(pbuf, sizeof pbuf, "%c", ch);
1171 else
1172 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
1173 return(pbuf);
1174 }
1175 #endif
1176 #endif
1177
1178 #undef matcher
1179 #undef fast
1180 #undef slow
1181 #undef dissect
1182 #undef backref
1183 #undef step
1184 #undef print
1185 #undef at
1186 #undef match
1187 #undef nope
1188