engine.c revision 1.17 1 /* $NetBSD: engine.c,v 1.17 2004/03/26 22:42:17 enami Exp $ */
2
3 /*-
4 * Copyright (c) 1992, 1993, 1994
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Henry Spencer.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)engine.c 8.5 (Berkeley) 3/20/94
35 */
36
37 /*-
38 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
39 *
40 * This code is derived from software contributed to Berkeley by
41 * Henry Spencer.
42 *
43 * Redistribution and use in source and binary forms, with or without
44 * modification, are permitted provided that the following conditions
45 * are met:
46 * 1. Redistributions of source code must retain the above copyright
47 * notice, this list of conditions and the following disclaimer.
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 * 3. All advertising materials mentioning features or use of this software
52 * must display the following acknowledgement:
53 * This product includes software developed by the University of
54 * California, Berkeley and its contributors.
55 * 4. Neither the name of the University nor the names of its contributors
56 * may be used to endorse or promote products derived from this software
57 * without specific prior written permission.
58 *
59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
62 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
69 * SUCH DAMAGE.
70 *
71 * @(#)engine.c 8.5 (Berkeley) 3/20/94
72 */
73
74 /*
75 * The matching engine and friends. This file is #included by regexec.c
76 * after suitable #defines of a variety of macros used herein, so that
77 * different state representations can be used without duplicating masses
78 * of code.
79 */
80
81 #ifdef SNAMES
82 #define matcher smatcher
83 #define fast sfast
84 #define slow sslow
85 #define dissect sdissect
86 #define backref sbackref
87 #define step sstep
88 #define print sprint
89 #define at sat
90 #define match smat
91 #define nope snope
92 #endif
93 #ifdef LNAMES
94 #define matcher lmatcher
95 #define fast lfast
96 #define slow lslow
97 #define dissect ldissect
98 #define backref lbackref
99 #define step lstep
100 #define print lprint
101 #define at lat
102 #define match lmat
103 #define nope lnope
104 #endif
105
106 /* another structure passed up and down to avoid zillions of parameters */
107 struct match {
108 struct re_guts *g;
109 int eflags;
110 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
111 char *offp; /* offsets work from here */
112 char *beginp; /* start of string -- virtual NUL precedes */
113 char *endp; /* end of string -- virtual NUL here */
114 char *coldp; /* can be no match starting before here */
115 char **lastpos; /* [nplus+1] */
116 STATEVARS;
117 states st; /* current states */
118 states fresh; /* states for a fresh start */
119 states tmp; /* temporary */
120 states empty; /* empty set of states */
121 };
122
123 /* ========= begin header generated by ./mkh ========= */
124 #ifdef __cplusplus
125 extern "C" {
126 #endif
127
128 /* === engine.c === */
129 static int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags));
130 static char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
131 static char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev));
132 static char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
133 static char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst));
134 static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft));
135 #define BOL (OUT+1)
136 #define EOL (BOL+1)
137 #define BOLEOL (BOL+2)
138 #define NOTHING (BOL+3)
139 #define BOW (BOL+4)
140 #define EOW (BOL+5)
141 #define CODEMAX (BOL+5) /* highest code used */
142 #define NONCHAR(c) ((c) > CHAR_MAX)
143 #define NNONCHAR (CODEMAX-CHAR_MAX)
144 #ifdef REDEBUG
145 static void print __P((struct match *m, char *caption, states st, int ch, FILE *d));
146 #endif
147 #ifdef REDEBUG
148 static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst));
149 #endif
150 #ifdef REDEBUG
151 static char *pchar __P((int ch));
152 #endif
153
154 #ifdef __cplusplus
155 }
156 #endif
157 /* ========= end header generated by ./mkh ========= */
158
159 #ifdef REDEBUG
160 #define SP(t, s, c) print(m, t, s, c, stdout)
161 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
162 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); }
163 static int nope = 0;
164 #else
165 #define SP(t, s, c) /* nothing */
166 #define AT(t, p1, p2, s1, s2) /* nothing */
167 #define NOTE(s) /* nothing */
168 #endif
169
170 /*
171 - matcher - the actual matching engine
172 == static int matcher(struct re_guts *g, char *string, \
173 == size_t nmatch, regmatch_t pmatch[], int eflags);
174 */
175 static int /* 0 success, REG_NOMATCH failure */
176 matcher(g, string, nmatch, pmatch, eflags)
177 struct re_guts *g;
178 char *string;
179 size_t nmatch;
180 regmatch_t pmatch[];
181 int eflags;
182 {
183 char *endp;
184 int i;
185 struct match mv;
186 struct match *m = &mv;
187 char *dp;
188 const sopno gf = g->firststate+1; /* +1 for OEND */
189 const sopno gl = g->laststate;
190 char *start;
191 char *stop;
192 int error = 0;
193
194 _DIAGASSERT(g != NULL);
195 _DIAGASSERT(string != NULL);
196 /* pmatch checked below */
197
198 /* simplify the situation where possible */
199 if (g->cflags®_NOSUB)
200 nmatch = 0;
201 if (eflags®_STARTEND) {
202 _DIAGASSERT(pmatch != NULL);
203 start = string + (size_t)pmatch[0].rm_so;
204 stop = string + (size_t)pmatch[0].rm_eo;
205 } else {
206 start = string;
207 stop = start + strlen(start);
208 }
209 if (stop < start)
210 return(REG_INVARG);
211
212 /* prescreening; this does wonders for this rather slow code */
213 if (g->must != NULL) {
214 for (dp = start; dp < stop; dp++)
215 if (*dp == g->must[0] && stop - dp >= g->mlen &&
216 memcmp(dp, g->must, (size_t)g->mlen) == 0)
217 break;
218 if (dp == stop) /* we didn't find g->must */
219 return(REG_NOMATCH);
220 }
221
222 /* match struct setup */
223 m->g = g;
224 m->eflags = eflags;
225 m->pmatch = NULL;
226 m->lastpos = NULL;
227 m->offp = string;
228 m->beginp = start;
229 m->endp = stop;
230 STATESETUP(m, 4);
231 SETUP(m->st);
232 SETUP(m->fresh);
233 SETUP(m->tmp);
234 SETUP(m->empty);
235 CLEAR(m->empty);
236
237 /* this loop does only one repetition except for backrefs */
238 for (;;) {
239 endp = fast(m, start, stop, gf, gl);
240 if (endp == NULL) { /* a miss */
241 error = REG_NOMATCH;
242 goto done;
243 }
244 if (nmatch == 0 && !g->backrefs)
245 break; /* no further info needed */
246
247 /* where? */
248 assert(m->coldp != NULL);
249 for (;;) {
250 NOTE("finding start");
251 endp = slow(m, m->coldp, stop, gf, gl);
252 if (endp != NULL)
253 break;
254 assert(m->coldp < m->endp);
255 m->coldp++;
256 }
257 if (nmatch == 1 && !g->backrefs)
258 break; /* no further info needed */
259
260 /* oh my, he wants the subexpressions... */
261 if (m->pmatch == NULL)
262 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
263 sizeof(regmatch_t));
264 if (m->pmatch == NULL) {
265 error = REG_ESPACE;
266 goto done;
267 }
268 for (i = 1; i <= m->g->nsub; i++)
269 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = (regoff_t)-1;
270 if (!g->backrefs && !(m->eflags®_BACKR)) {
271 NOTE("dissecting");
272 dp = dissect(m, m->coldp, endp, gf, gl);
273 } else {
274 if (g->nplus > 0 && m->lastpos == NULL)
275 m->lastpos = (char **)malloc((g->nplus+1) *
276 sizeof(char *));
277 if (g->nplus > 0 && m->lastpos == NULL) {
278 error = REG_ESPACE;
279 goto done;
280 }
281 NOTE("backref dissect");
282 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
283 }
284 if (dp != NULL)
285 break;
286
287 /* uh-oh... we couldn't find a subexpression-level match */
288 assert(g->backrefs); /* must be back references doing it */
289 assert(g->nplus == 0 || m->lastpos != NULL);
290 for (;;) {
291 if (dp != NULL || endp <= m->coldp)
292 break; /* defeat */
293 NOTE("backoff");
294 endp = slow(m, m->coldp, endp-1, gf, gl);
295 if (endp == NULL)
296 break; /* defeat */
297 /* try it on a shorter possibility */
298 #ifndef NDEBUG
299 for (i = 1; i <= m->g->nsub; i++) {
300 assert(m->pmatch[i].rm_so == (regoff_t)-1);
301 assert(m->pmatch[i].rm_eo == (regoff_t)-1);
302 }
303 #endif
304 NOTE("backoff dissect");
305 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
306 }
307 assert(dp == NULL || dp == endp);
308 if (dp != NULL) /* found a shorter one */
309 break;
310
311 /* despite initial appearances, there is no match here */
312 NOTE("false alarm");
313 start = m->coldp + 1; /* recycle starting later */
314 assert(start <= stop);
315 }
316
317 /* fill in the details if requested */
318 if (nmatch > 0) {
319 _DIAGASSERT(pmatch != NULL);
320 pmatch[0].rm_so = m->coldp - m->offp;
321 pmatch[0].rm_eo = endp - m->offp;
322 }
323 if (nmatch > 1) {
324 assert(m->pmatch != NULL);
325 for (i = 1; i < nmatch; i++)
326 if (i <= m->g->nsub)
327 pmatch[i] = m->pmatch[i];
328 else {
329 pmatch[i].rm_so = (regoff_t)-1;
330 pmatch[i].rm_eo = (regoff_t)-1;
331 }
332 }
333
334 done:
335 if (m->pmatch != NULL) {
336 free(m->pmatch);
337 m->pmatch = NULL;
338 }
339 if (m->lastpos != NULL) {
340 free(m->lastpos);
341 m->lastpos = NULL;
342 }
343 STATETEARDOWN(m);
344 return error;
345 }
346
347 /*
348 - dissect - figure out what matched what, no back references
349 == static char *dissect(struct match *m, char *start, \
350 == char *stop, sopno startst, sopno stopst);
351 */
352 static char * /* == stop (success) always */
353 dissect(m, start, stop, startst, stopst)
354 struct match *m;
355 char *start;
356 char *stop;
357 sopno startst;
358 sopno stopst;
359 {
360 int i;
361 sopno ss; /* start sop of current subRE */
362 sopno es; /* end sop of current subRE */
363 char *sp; /* start of string matched by it */
364 char *stp; /* string matched by it cannot pass here */
365 char *rest; /* start of rest of string */
366 char *tail; /* string unmatched by rest of RE */
367 sopno ssub; /* start sop of subsubRE */
368 sopno esub; /* end sop of subsubRE */
369 char *ssp; /* start of string matched by subsubRE */
370 char *sep; /* end of string matched by subsubRE */
371 char *oldssp; /* previous ssp */
372 #ifndef NDEBUG
373 char *dp;
374 #endif
375
376 _DIAGASSERT(m != NULL);
377 _DIAGASSERT(start != NULL);
378 _DIAGASSERT(stop != NULL);
379
380 AT("diss", start, stop, startst, stopst);
381 sp = start;
382 for (ss = startst; ss < stopst; ss = es) {
383 /* identify end of subRE */
384 es = ss;
385 switch (OP(m->g->strip[es])) {
386 case OPLUS_:
387 case OQUEST_:
388 es += OPND(m->g->strip[es]);
389 break;
390 case OCH_:
391 while (OP(m->g->strip[es]) != O_CH)
392 es += OPND(m->g->strip[es]);
393 break;
394 }
395 es++;
396
397 /* figure out what it matched */
398 switch (OP(m->g->strip[ss])) {
399 case OEND:
400 assert(nope);
401 break;
402 case OCHAR:
403 sp++;
404 break;
405 case OBOL:
406 case OEOL:
407 case OBOW:
408 case OEOW:
409 break;
410 case OANY:
411 case OANYOF:
412 sp++;
413 break;
414 case OBACK_:
415 case O_BACK:
416 assert(nope);
417 break;
418 /* cases where length of match is hard to find */
419 case OQUEST_:
420 stp = stop;
421 for (;;) {
422 /* how long could this one be? */
423 rest = slow(m, sp, stp, ss, es);
424 assert(rest != NULL); /* it did match */
425 /* could the rest match the rest? */
426 tail = slow(m, rest, stop, es, stopst);
427 if (tail == stop)
428 break; /* yes! */
429 /* no -- try a shorter match for this one */
430 stp = rest - 1;
431 assert(stp >= sp); /* it did work */
432 }
433 ssub = ss + 1;
434 esub = es - 1;
435 /* did innards match? */
436 if (slow(m, sp, rest, ssub, esub) != NULL) {
437 #ifdef NDEBUG
438 (void)
439 #else
440 dp =
441 #endif
442 dissect(m, sp, rest, ssub, esub);
443 assert(dp == rest);
444 } else /* no */
445 assert(sp == rest);
446 sp = rest;
447 break;
448 case OPLUS_:
449 stp = stop;
450 for (;;) {
451 /* how long could this one be? */
452 rest = slow(m, sp, stp, ss, es);
453 assert(rest != NULL); /* it did match */
454 /* could the rest match the rest? */
455 tail = slow(m, rest, stop, es, stopst);
456 if (tail == stop)
457 break; /* yes! */
458 /* no -- try a shorter match for this one */
459 stp = rest - 1;
460 assert(stp >= sp); /* it did work */
461 }
462 ssub = ss + 1;
463 esub = es - 1;
464 ssp = sp;
465 oldssp = ssp;
466 for (;;) { /* find last match of innards */
467 sep = slow(m, ssp, rest, ssub, esub);
468 if (sep == NULL || sep == ssp)
469 break; /* failed or matched null */
470 oldssp = ssp; /* on to next try */
471 ssp = sep;
472 }
473 if (sep == NULL) {
474 /* last successful match */
475 sep = ssp;
476 ssp = oldssp;
477 }
478 assert(sep == rest); /* must exhaust substring */
479 assert(slow(m, ssp, sep, ssub, esub) == rest);
480 #ifdef NDEBUG
481 (void)
482 #else
483 dp =
484 #endif
485 dissect(m, ssp, sep, ssub, esub);
486 assert(dp == sep);
487 sp = rest;
488 break;
489 case OCH_:
490 stp = stop;
491 for (;;) {
492 /* how long could this one be? */
493 rest = slow(m, sp, stp, ss, es);
494 assert(rest != NULL); /* it did match */
495 /* could the rest match the rest? */
496 tail = slow(m, rest, stop, es, stopst);
497 if (tail == stop)
498 break; /* yes! */
499 /* no -- try a shorter match for this one */
500 stp = rest - 1;
501 assert(stp >= sp); /* it did work */
502 }
503 ssub = ss + 1;
504 esub = ss + OPND(m->g->strip[ss]) - 1;
505 assert(OP(m->g->strip[esub]) == OOR1);
506 for (;;) { /* find first matching branch */
507 if (slow(m, sp, rest, ssub, esub) == rest)
508 break; /* it matched all of it */
509 /* that one missed, try next one */
510 assert(OP(m->g->strip[esub]) == OOR1);
511 esub++;
512 assert(OP(m->g->strip[esub]) == OOR2);
513 ssub = esub + 1;
514 esub += OPND(m->g->strip[esub]);
515 if (OP(m->g->strip[esub]) == OOR2)
516 esub--;
517 else
518 assert(OP(m->g->strip[esub]) == O_CH);
519 }
520 #ifdef NDEBUG
521 (void)
522 #else
523 dp =
524 #endif
525 dissect(m, sp, rest, ssub, esub);
526 assert(dp == rest);
527 sp = rest;
528 break;
529 case O_PLUS:
530 case O_QUEST:
531 case OOR1:
532 case OOR2:
533 case O_CH:
534 assert(nope);
535 break;
536 case OLPAREN:
537 i = OPND(m->g->strip[ss]);
538 assert(0 < i && i <= m->g->nsub);
539 m->pmatch[i].rm_so = sp - m->offp;
540 break;
541 case ORPAREN:
542 i = OPND(m->g->strip[ss]);
543 assert(0 < i && i <= m->g->nsub);
544 m->pmatch[i].rm_eo = sp - m->offp;
545 break;
546 default: /* uh oh */
547 assert(nope);
548 break;
549 }
550 }
551
552 assert(sp == stop);
553 return(sp);
554 }
555
556 /*
557 - backref - figure out what matched what, figuring in back references
558 == static char *backref(struct match *m, char *start, \
559 == char *stop, sopno startst, sopno stopst, sopno lev);
560 */
561 static char * /* == stop (success) or NULL (failure) */
562 backref(m, start, stop, startst, stopst, lev)
563 struct match *m;
564 char *start;
565 char *stop;
566 sopno startst;
567 sopno stopst;
568 sopno lev; /* PLUS nesting level */
569 {
570 int i;
571 sopno ss; /* start sop of current subRE */
572 char *sp; /* start of string matched by it */
573 sopno ssub; /* start sop of subsubRE */
574 sopno esub; /* end sop of subsubRE */
575 char *ssp; /* start of string matched by subsubRE */
576 char *dp;
577 size_t len;
578 int hard;
579 sop s;
580 regoff_t offsave;
581 cset *cs;
582
583 _DIAGASSERT(m != NULL);
584 _DIAGASSERT(start != NULL);
585 _DIAGASSERT(stop != NULL);
586
587 AT("back", start, stop, startst, stopst);
588 sp = start;
589
590 /* get as far as we can with easy stuff */
591 hard = 0;
592 for (ss = startst; !hard && ss < stopst; ss++)
593 switch (OP(s = m->g->strip[ss])) {
594 case OCHAR:
595 if (sp == stop || *sp++ != (char)OPND(s))
596 return(NULL);
597 break;
598 case OANY:
599 if (sp == stop)
600 return(NULL);
601 sp++;
602 break;
603 case OANYOF:
604 cs = &m->g->sets[OPND(s)];
605 if (sp == stop || !CHIN(cs, *sp++))
606 return(NULL);
607 break;
608 case OBOL:
609 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
610 (sp < m->endp && *(sp-1) == '\n' &&
611 (m->g->cflags®_NEWLINE)) )
612 { /* yes */ }
613 else
614 return(NULL);
615 break;
616 case OEOL:
617 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
618 (sp < m->endp && *sp == '\n' &&
619 (m->g->cflags®_NEWLINE)) )
620 { /* yes */ }
621 else
622 return(NULL);
623 break;
624 case OBOW:
625 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
626 (sp < m->endp && *(sp-1) == '\n' &&
627 (m->g->cflags®_NEWLINE)) ||
628 (sp > m->beginp &&
629 !ISWORD(*(sp-1))) ) &&
630 (sp < m->endp && ISWORD(*sp)) )
631 { /* yes */ }
632 else
633 return(NULL);
634 break;
635 case OEOW:
636 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
637 (sp < m->endp && *sp == '\n' &&
638 (m->g->cflags®_NEWLINE)) ||
639 (sp < m->endp && !ISWORD(*sp)) ) &&
640 (sp > m->beginp && ISWORD(*(sp-1))) )
641 { /* yes */ }
642 else
643 return(NULL);
644 break;
645 case O_QUEST:
646 break;
647 case OOR1: /* matches null but needs to skip */
648 ss++;
649 s = m->g->strip[ss];
650 do {
651 assert(OP(s) == OOR2);
652 ss += OPND(s);
653 } while (OP(s = m->g->strip[ss]) != O_CH);
654 /* note that the ss++ gets us past the O_CH */
655 break;
656 default: /* have to make a choice */
657 hard = 1;
658 break;
659 }
660 if (!hard) { /* that was it! */
661 if (sp != stop)
662 return(NULL);
663 return(sp);
664 }
665 ss--; /* adjust for the for's final increment */
666
667 /* the hard stuff */
668 AT("hard", sp, stop, ss, stopst);
669 s = m->g->strip[ss];
670 switch (OP(s)) {
671 case OBACK_: /* the vilest depths */
672 i = OPND(s);
673 assert(0 < i && i <= m->g->nsub);
674 if (m->pmatch[i].rm_eo == (regoff_t)-1)
675 return(NULL);
676 assert(m->pmatch[i].rm_so != (regoff_t)-1);
677 len = (size_t)(m->pmatch[i].rm_eo - m->pmatch[i].rm_so);
678 assert(stop - m->beginp >= len);
679 if (sp > stop - len)
680 return(NULL); /* not enough left to match */
681 ssp = m->offp + (size_t)m->pmatch[i].rm_so;
682 if (memcmp(sp, ssp, len) != 0)
683 return(NULL);
684 while (m->g->strip[ss] != SOP(O_BACK, i))
685 ss++;
686 return(backref(m, sp+len, stop, ss+1, stopst, lev));
687
688 case OQUEST_: /* to null or not */
689 dp = backref(m, sp, stop, ss+1, stopst, lev);
690 if (dp != NULL)
691 return(dp); /* not */
692 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
693
694 case OPLUS_:
695 assert(m->lastpos != NULL);
696 assert(lev+1 <= m->g->nplus);
697 m->lastpos[lev+1] = sp;
698 return(backref(m, sp, stop, ss+1, stopst, lev+1));
699
700 case O_PLUS:
701 if (sp == m->lastpos[lev]) /* last pass matched null */
702 return(backref(m, sp, stop, ss+1, stopst, lev-1));
703 /* try another pass */
704 m->lastpos[lev] = sp;
705 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
706 if (dp == NULL)
707 dp = backref(m, sp, stop, ss+1, stopst, lev-1);
708 return(dp);
709
710 case OCH_: /* find the right one, if any */
711 ssub = ss + 1;
712 esub = ss + OPND(s) - 1;
713 assert(OP(m->g->strip[esub]) == OOR1);
714 for (;;) { /* find first matching branch */
715 dp = backref(m, sp, stop, ssub, esub, lev);
716 if (dp != NULL)
717 return(dp);
718 /* that one missed, try next one */
719 if (OP(m->g->strip[esub]) == O_CH)
720 return(NULL); /* there is none */
721 esub++;
722 assert(OP(m->g->strip[esub]) == OOR2);
723 ssub = esub + 1;
724 esub += OPND(m->g->strip[esub]);
725 if (OP(m->g->strip[esub]) == OOR2)
726 esub--;
727 else
728 assert(OP(m->g->strip[esub]) == O_CH);
729 }
730
731 case OLPAREN: /* must undo assignment if rest fails */
732 i = OPND(s);
733 assert(0 < i && i <= m->g->nsub);
734 offsave = m->pmatch[i].rm_so;
735 m->pmatch[i].rm_so = sp - m->offp;
736 dp = backref(m, sp, stop, ss+1, stopst, lev);
737 if (dp != NULL)
738 return(dp);
739 m->pmatch[i].rm_so = offsave;
740 return(NULL);
741
742 case ORPAREN: /* must undo assignment if rest fails */
743 i = OPND(s);
744 assert(0 < i && i <= m->g->nsub);
745 offsave = m->pmatch[i].rm_eo;
746 m->pmatch[i].rm_eo = sp - m->offp;
747 dp = backref(m, sp, stop, ss+1, stopst, lev);
748 if (dp != NULL)
749 return(dp);
750 m->pmatch[i].rm_eo = offsave;
751 return(NULL);
752
753 default: /* uh oh */
754 assert(nope);
755 break;
756 }
757
758 /* "can't happen" */
759 assert(nope);
760 /* NOTREACHED */
761 return NULL;
762 }
763
764 /*
765 - fast - step through the string at top speed
766 == static char *fast(struct match *m, char *start, \
767 == char *stop, sopno startst, sopno stopst);
768 */
769 static char * /* where tentative match ended, or NULL */
770 fast(m, start, stop, startst, stopst)
771 struct match *m;
772 char *start;
773 char *stop;
774 sopno startst;
775 sopno stopst;
776 {
777 states st = m->st;
778 states fresh = m->fresh;
779 states tmp = m->tmp;
780 char *p = start;
781 int c = (start == m->beginp) ? OUT : *(start-1);
782 int lastc; /* previous c */
783 int flagch;
784 int i;
785 char *coldp; /* last p after which no match was underway */
786
787 _DIAGASSERT(m != NULL);
788 _DIAGASSERT(start != NULL);
789 _DIAGASSERT(stop != NULL);
790
791 CLEAR(st);
792 SET1(st, startst);
793 st = step(m->g, startst, stopst, st, NOTHING, st);
794 ASSIGN(fresh, st);
795 SP("start", st, *p);
796 coldp = NULL;
797 for (;;) {
798 /* next character */
799 lastc = c;
800 c = (p == m->endp) ? OUT : *p;
801 if (EQ(st, fresh))
802 coldp = p;
803
804 /* is there an EOL and/or BOL between lastc and c? */
805 flagch = '\0';
806 i = 0;
807 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
808 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
809 flagch = BOL;
810 i = m->g->nbol;
811 }
812 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
813 (c == OUT && !(m->eflags®_NOTEOL)) ) {
814 flagch = (flagch == BOL) ? BOLEOL : EOL;
815 i += m->g->neol;
816 }
817 if (i != 0) {
818 for (; i > 0; i--)
819 st = step(m->g, startst, stopst, st, flagch, st);
820 SP("boleol", st, c);
821 }
822
823 /* how about a word boundary? */
824 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
825 (c != OUT && ISWORD(c)) ) {
826 flagch = BOW;
827 }
828 if ( (lastc != OUT && ISWORD(lastc)) &&
829 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
830 flagch = EOW;
831 }
832 if (flagch == BOW || flagch == EOW) {
833 st = step(m->g, startst, stopst, st, flagch, st);
834 SP("boweow", st, c);
835 }
836
837 /* are we done? */
838 if (ISSET(st, stopst) || p == stop)
839 break; /* NOTE BREAK OUT */
840
841 /* no, we must deal with this character */
842 ASSIGN(tmp, st);
843 ASSIGN(st, fresh);
844 assert(c != OUT);
845 st = step(m->g, startst, stopst, tmp, c, st);
846 SP("aft", st, c);
847 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
848 p++;
849 }
850
851 assert(coldp != NULL);
852 m->coldp = coldp;
853 if (ISSET(st, stopst))
854 return(p+1);
855 else
856 return(NULL);
857 }
858
859 /*
860 - slow - step through the string more deliberately
861 == static char *slow(struct match *m, char *start, \
862 == char *stop, sopno startst, sopno stopst);
863 */
864 static char * /* where it ended */
865 slow(m, start, stop, startst, stopst)
866 struct match *m;
867 char *start;
868 char *stop;
869 sopno startst;
870 sopno stopst;
871 {
872 states st = m->st;
873 states empty = m->empty;
874 states tmp = m->tmp;
875 char *p = start;
876 int c = (start == m->beginp) ? OUT : *(start-1);
877 int lastc; /* previous c */
878 int flagch;
879 int i;
880 char *matchp; /* last p at which a match ended */
881
882 _DIAGASSERT(m != NULL);
883 _DIAGASSERT(start != NULL);
884 _DIAGASSERT(stop != NULL);
885
886 AT("slow", start, stop, startst, stopst);
887 CLEAR(st);
888 SET1(st, startst);
889 SP("sstart", st, *p);
890 st = step(m->g, startst, stopst, st, NOTHING, st);
891 matchp = NULL;
892 for (;;) {
893 /* next character */
894 lastc = c;
895 c = (p == m->endp) ? OUT : *p;
896
897 /* is there an EOL and/or BOL between lastc and c? */
898 flagch = '\0';
899 i = 0;
900 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
901 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
902 flagch = BOL;
903 i = m->g->nbol;
904 }
905 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
906 (c == OUT && !(m->eflags®_NOTEOL)) ) {
907 flagch = (flagch == BOL) ? BOLEOL : EOL;
908 i += m->g->neol;
909 }
910 if (i != 0) {
911 for (; i > 0; i--)
912 st = step(m->g, startst, stopst, st, flagch, st);
913 SP("sboleol", st, c);
914 }
915
916 /* how about a word boundary? */
917 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
918 (c != OUT && ISWORD(c)) ) {
919 flagch = BOW;
920 }
921 if ( (lastc != OUT && ISWORD(lastc)) &&
922 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
923 flagch = EOW;
924 }
925 if (flagch == BOW || flagch == EOW) {
926 st = step(m->g, startst, stopst, st, flagch, st);
927 SP("sboweow", st, c);
928 }
929
930 /* are we done? */
931 if (ISSET(st, stopst))
932 matchp = p;
933 if (EQ(st, empty) || p == stop)
934 break; /* NOTE BREAK OUT */
935
936 /* no, we must deal with this character */
937 ASSIGN(tmp, st);
938 ASSIGN(st, empty);
939 assert(c != OUT);
940 st = step(m->g, startst, stopst, tmp, c, st);
941 SP("saft", st, c);
942 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
943 p++;
944 }
945
946 return(matchp);
947 }
948
949
950 /*
951 - step - map set of states reachable before char to set reachable after
952 == static states step(struct re_guts *g, sopno start, sopno stop, \
953 == states bef, int ch, states aft);
954 == #define BOL (OUT+1)
955 == #define EOL (BOL+1)
956 == #define BOLEOL (BOL+2)
957 == #define NOTHING (BOL+3)
958 == #define BOW (BOL+4)
959 == #define EOW (BOL+5)
960 == #define CODEMAX (BOL+5) // highest code used
961 == #define NONCHAR(c) ((c) > CHAR_MAX)
962 == #define NNONCHAR (CODEMAX-CHAR_MAX)
963 */
964 static states
965 step(g, start, stop, bef, ch, aft)
966 struct re_guts *g;
967 sopno start; /* start state within strip */
968 sopno stop; /* state after stop state within strip */
969 states bef; /* states reachable before */
970 int ch; /* character or NONCHAR code */
971 states aft; /* states already known reachable after */
972 {
973 cset *cs;
974 sop s;
975 sopno pc;
976 onestate here; /* note, macros know this name */
977 sopno look;
978 int i;
979
980 _DIAGASSERT(g != NULL);
981
982 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
983 s = g->strip[pc];
984 switch (OP(s)) {
985 case OEND:
986 assert(pc == stop-1);
987 break;
988 case OCHAR:
989 /* only characters can match */
990 assert(!NONCHAR(ch) || ch != (char)OPND(s));
991 if (ch == (char)OPND(s))
992 FWD(aft, bef, 1);
993 break;
994 case OBOL:
995 if (ch == BOL || ch == BOLEOL)
996 FWD(aft, bef, 1);
997 break;
998 case OEOL:
999 if (ch == EOL || ch == BOLEOL)
1000 FWD(aft, bef, 1);
1001 break;
1002 case OBOW:
1003 if (ch == BOW)
1004 FWD(aft, bef, 1);
1005 break;
1006 case OEOW:
1007 if (ch == EOW)
1008 FWD(aft, bef, 1);
1009 break;
1010 case OANY:
1011 if (!NONCHAR(ch))
1012 FWD(aft, bef, 1);
1013 break;
1014 case OANYOF:
1015 cs = &g->sets[OPND(s)];
1016 if (!NONCHAR(ch) && CHIN(cs, ch))
1017 FWD(aft, bef, 1);
1018 break;
1019 case OBACK_: /* ignored here */
1020 case O_BACK:
1021 FWD(aft, aft, 1);
1022 break;
1023 case OPLUS_: /* forward, this is just an empty */
1024 FWD(aft, aft, 1);
1025 break;
1026 case O_PLUS: /* both forward and back */
1027 FWD(aft, aft, 1);
1028 i = ISSETBACK(aft, OPND(s));
1029 BACK(aft, aft, OPND(s));
1030 if (!i && ISSETBACK(aft, OPND(s))) {
1031 /* oho, must reconsider loop body */
1032 pc -= OPND(s) + 1;
1033 INIT(here, pc);
1034 }
1035 break;
1036 case OQUEST_: /* two branches, both forward */
1037 FWD(aft, aft, 1);
1038 FWD(aft, aft, OPND(s));
1039 break;
1040 case O_QUEST: /* just an empty */
1041 FWD(aft, aft, 1);
1042 break;
1043 case OLPAREN: /* not significant here */
1044 case ORPAREN:
1045 FWD(aft, aft, 1);
1046 break;
1047 case OCH_: /* mark the first two branches */
1048 FWD(aft, aft, 1);
1049 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1050 FWD(aft, aft, OPND(s));
1051 break;
1052 case OOR1: /* done a branch, find the O_CH */
1053 if (ISSTATEIN(aft, here)) {
1054 for (look = 1;
1055 OP(s = g->strip[pc+look]) != O_CH;
1056 look += OPND(s))
1057 assert(OP(s) == OOR2);
1058 FWD(aft, aft, look);
1059 }
1060 break;
1061 case OOR2: /* propagate OCH_'s marking */
1062 FWD(aft, aft, 1);
1063 if (OP(g->strip[pc+OPND(s)]) != O_CH) {
1064 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1065 FWD(aft, aft, OPND(s));
1066 }
1067 break;
1068 case O_CH: /* just empty */
1069 FWD(aft, aft, 1);
1070 break;
1071 default: /* ooooops... */
1072 assert(nope);
1073 break;
1074 }
1075 }
1076
1077 return(aft);
1078 }
1079
1080 #ifdef REDEBUG
1081 /*
1082 - print - print a set of states
1083 == #ifdef REDEBUG
1084 == static void print(struct match *m, char *caption, states st, \
1085 == int ch, FILE *d);
1086 == #endif
1087 */
1088 static void
1089 print(m, caption, st, ch, d)
1090 struct match *m;
1091 char *caption;
1092 states st;
1093 int ch;
1094 FILE *d;
1095 {
1096 struct re_guts *g = m->g;
1097 int i;
1098 int first = 1;
1099
1100 _DIAGASSERT(m != NULL);
1101 _DIAGASSERT(caption != NULL);
1102
1103 if (!(m->eflags®_TRACE))
1104 return;
1105
1106 _DIAGASSERT(d != NULL);
1107
1108 fprintf(d, "%s", caption);
1109 if (ch != '\0')
1110 fprintf(d, " %s", pchar(ch));
1111 for (i = 0; i < g->nstates; i++)
1112 if (ISSET(st, i)) {
1113 fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1114 first = 0;
1115 }
1116 fprintf(d, "\n");
1117 }
1118
1119 /*
1120 - at - print current situation
1121 == #ifdef REDEBUG
1122 == static void at(struct match *m, char *title, char *start, char *stop, \
1123 == sopno startst, sopno stopst);
1124 == #endif
1125 */
1126 static void
1127 at(m, title, start, stop, startst, stopst)
1128 struct match *m;
1129 char *title;
1130 char *start;
1131 char *stop;
1132 sopno startst;
1133 sopno stopst;
1134 {
1135
1136 _DIAGASSERT(m != NULL);
1137 _DIAGASSERT(title != NULL);
1138 _DIAGASSERT(start != NULL);
1139 _DIAGASSERT(stop != NULL);
1140
1141 if (!(m->eflags®_TRACE))
1142 return;
1143
1144 printf("%s %s-", title, pchar(*start));
1145 printf("%s ", pchar(*stop));
1146 printf("%ld-%ld\n", (long)startst, (long)stopst);
1147 }
1148
1149 #ifndef PCHARDONE
1150 #define PCHARDONE /* never again */
1151 /*
1152 - pchar - make a character printable
1153 == #ifdef REDEBUG
1154 == static char *pchar(int ch);
1155 == #endif
1156 *
1157 * Is this identical to regchar() over in debug.c? Well, yes. But a
1158 * duplicate here avoids having a debugging-capable regexec.o tied to
1159 * a matching debug.o, and this is convenient. It all disappears in
1160 * the non-debug compilation anyway, so it doesn't matter much.
1161 */
1162 static char * /* -> representation */
1163 pchar(ch)
1164 int ch;
1165 {
1166 static char pbuf[10];
1167
1168 if (isprint(ch) || ch == ' ')
1169 (void)snprintf(pbuf, sizeof pbuf, "%c", ch);
1170 else
1171 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
1172 return(pbuf);
1173 }
1174 #endif
1175 #endif
1176
1177 #undef matcher
1178 #undef fast
1179 #undef slow
1180 #undef dissect
1181 #undef backref
1182 #undef step
1183 #undef print
1184 #undef at
1185 #undef match
1186 #undef nope
1187