regcomp.c revision 1.43 1 /* $NetBSD: regcomp.c,v 1.43 2021/02/25 21:47:46 christos Exp $ */
2
3 /*-
4 * SPDX-License-Identifier: BSD-3-Clause
5 *
6 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
7 * Copyright (c) 1992, 1993, 1994
8 * The Regents of the University of California. All rights reserved.
9 *
10 * Copyright (c) 2011 The FreeBSD Foundation
11 * All rights reserved.
12 * Portions of this software were developed by David Chisnall
13 * under sponsorship from the FreeBSD Foundation.
14 *
15 * This code is derived from software contributed to Berkeley by
16 * Henry Spencer.
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 * 1. Redistributions of source code must retain the above copyright
22 * notice, this list of conditions and the following disclaimer.
23 * 2. Redistributions in binary form must reproduce the above copyright
24 * notice, this list of conditions and the following disclaimer in the
25 * documentation and/or other materials provided with the distribution.
26 * 3. Neither the name of the University nor the names of its contributors
27 * may be used to endorse or promote products derived from this software
28 * without specific prior written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40 * SUCH DAMAGE.
41 *
42 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
43 */
44
45 #include <sys/cdefs.h>
46 #if 0
47 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
48 __FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 368359 2020-12-05 03:18:48Z kevans $");
49 #endif
50 __RCSID("$NetBSD: regcomp.c,v 1.43 2021/02/25 21:47:46 christos Exp $");
51
52 #define _OPENBSD_SOURCE
53
54 #ifndef LIBHACK
55 #define REGEX_GNU_EXTENSIONS
56
57 #include "namespace.h"
58 #endif
59 #include <sys/types.h>
60 #include <stdio.h>
61 #include <string.h>
62 #include <ctype.h>
63 #include <limits.h>
64 #include <stdlib.h>
65 #include <regex.h>
66 #include <stdbool.h>
67
68 #if defined(_weak_alias) && !defined(LIBHACK)
69 __weak_alias(regcomp,_regcomp)
70 #endif
71
72 #ifdef REGEX_LIBC_COLLATE
73 #include "collate.h"
74 #endif
75
76 #include "utils.h"
77 #include "regex2.h"
78
79 #include "cname.h"
80
81 /*
82 * Branching context, used to keep track of branch state for all of the branch-
83 * aware functions. In addition to keeping track of branch positions for the
84 * p_branch_* functions, we use this to simplify some clumsiness in BREs for
85 * detection of whether ^ is acting as an anchor or being used erroneously and
86 * also for whether we're in a sub-expression or not.
87 */
88 struct branchc {
89 sopno start;
90 sopno back;
91 sopno fwd;
92
93 int nbranch;
94 int nchain;
95 bool outer;
96 bool terminate;
97 };
98
99 /*
100 * parse structure, passed up and down to avoid global variables and
101 * other clumsinesses
102 */
103 struct parse {
104 const char *next; /* next character in RE */
105 const char *end; /* end of string (-> NUL normally) */
106 int error; /* has an error been seen? */
107 int gnuext;
108 sop *strip; /* malloced strip */
109 sopno ssize; /* malloced strip size (allocated) */
110 sopno slen; /* malloced strip length (used) */
111 size_t ncsalloc; /* number of csets allocated */
112 struct re_guts *g;
113 # define NPAREN 10 /* we need to remember () 1-9 for back refs */
114 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
115 sopno pend[NPAREN]; /* -> ) ([0] unused) */
116 bool allowbranch; /* can this expression branch? */
117 bool bre; /* convenience; is this a BRE? */
118 int pflags; /* other parsing flags -- legacy escapes? */
119 bool (*parse_expr)(struct parse *, struct branchc *);
120 void (*pre_parse)(struct parse *, struct branchc *);
121 void (*post_parse)(struct parse *, struct branchc *);
122 };
123
124 #define PFLAG_LEGACY_ESC 0x00000001
125
126 /* ========= begin header generated by ./mkh ========= */
127 #ifdef __cplusplus
128 extern "C" {
129 #endif
130
131 /* === regcomp.c === */
132 static bool p_ere_exp(struct parse *p, struct branchc *bc);
133 static void p_str(struct parse *p);
134 static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
135 static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
136 static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
137 static bool p_branch_empty(struct parse *p, struct branchc *bc);
138 static bool p_branch_do(struct parse *p, struct branchc *bc);
139 static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
140 static void p_bre_post_parse(struct parse *p, struct branchc *bc);
141 static void p_re(struct parse *p, int end1, int end2);
142 static bool p_simp_re(struct parse *p, struct branchc *bc);
143 static int p_count(struct parse *p);
144 static void p_bracket(struct parse *p);
145 static int p_range_cmp(wchar_t c1, wchar_t c2);
146 static void p_b_term(struct parse *p, cset *cs);
147 #ifdef REGEX_GNU_EXTENSIONS
148 static int p_b_pseudoclass(struct parse *p, char c);
149 #endif
150 static void p_b_cclass(struct parse *p, cset *cs);
151 static void p_b_cclass_named(struct parse *p, cset *cs, const char[]);
152 static void p_b_eclass(struct parse *p, cset *cs);
153 static wint_t p_b_symbol(struct parse *p);
154 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
155 static bool may_escape(struct parse *p, const wint_t ch);
156 static wint_t othercase(wint_t ch);
157 static void bothcases(struct parse *p, wint_t ch);
158 static void ordinary(struct parse *p, wint_t ch);
159 static void nonnewline(struct parse *p);
160 static void repeat(struct parse *p, sopno start, int from, int to);
161 static int seterr(struct parse *p, int e);
162 static cset *allocset(struct parse *p);
163 static void freeset(struct parse *p, cset *cs);
164 static void CHadd(struct parse *p, cset *cs, wint_t ch);
165 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
166 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
167 static wint_t singleton(cset *cs);
168 static sopno dupl(struct parse *p, sopno start, sopno finish);
169 static void doemit(struct parse *p, sop op, size_t opnd);
170 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
171 static void dofwd(struct parse *p, sopno pos, sop value);
172 static int enlarge(struct parse *p, sopno size);
173 static void stripsnug(struct parse *p, struct re_guts *g);
174 static void findmust(struct parse *p, struct re_guts *g);
175 static int altoffset(sop *scan, int offset);
176 static void computejumps(struct parse *p, struct re_guts *g);
177 static void computematchjumps(struct parse *p, struct re_guts *g);
178 static sopno pluscount(struct parse *p, struct re_guts *g);
179 static wint_t wgetnext(struct parse *p);
180
181 #ifdef __cplusplus
182 }
183 #endif
184 /* ========= end header generated by ./mkh ========= */
185
186 static char nuls[10]; /* place to point scanner in event of error */
187
188 /*
189 * macros for use with parse structure
190 * BEWARE: these know that the parse structure is named `p' !!!
191 */
192 #define PEEK() (*p->next)
193 #define PEEK2() (*(p->next+1))
194 #define MORE() (p->next < p->end)
195 #define MORE2() (p->next+1 < p->end)
196 #define SEE(c) (MORE() && PEEK() == (c))
197 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
198 #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a))
199 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
200 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
201 #define EATSPEC(a) (p->bre ? EATTWO('\\', a) : EAT(a))
202 #define NEXT() (p->next++)
203 #define NEXT2() (p->next += 2)
204 #define NEXTn(n) (p->next += (n))
205 #define GETNEXT() (*p->next++)
206 #define WGETNEXT() wgetnext(p)
207 #define SETERROR(e) seterr(p, (e))
208 #define REQUIRE(co, e) ((co) || SETERROR(e))
209 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
210 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
211 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
212 #define EMIT(op, sopnd) doemit(p, (op), (sopnd))
213 #define INSERT(op, pos) doinsert(p, (op), HERE()-(pos)+1, pos)
214 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
215 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
216 #define HERE() (p->slen)
217 #define THERE() (p->slen - 1)
218 #define THERETHERE() (p->slen - 2)
219 #define DROP(n) (p->slen -= (n))
220
221 /* Macro used by computejump()/computematchjump() */
222 #ifndef MIN
223 #define MIN(a,b) ((a)<(b)?(a):(b))
224 #endif
225
226 #ifndef NLS
227 static const struct {
228 const char *name;
229 int (*func)(int);
230 } wctypes[] = {
231 #define ADD(x) { .name = # x, .func = is ## x }
232 ADD(alnum),
233 ADD(alpha),
234 ADD(blank),
235 ADD(cntrl),
236 ADD(digit),
237 ADD(graph),
238 ADD(lower),
239 ADD(print),
240 ADD(punct),
241 ADD(space),
242 ADD(upper),
243 ADD(xdigit),
244 #undef ADD
245 };
246
247 wctype_t
248 __regex_wctype(const char *str)
249 {
250 for (size_t i = 0; i < __arraycount(wctypes); i++) {
251 if (strcmp(wctypes[i].name, str) == 0)
252 return (wctype_t)(i + 1);
253 }
254 return (wctype_t)0;
255 }
256
257 int
258 __regex_iswctype(wint_t c, wctype_t ct)
259 {
260 if (ct == 0)
261 return 0;
262 return (*wctypes[ct - 1].func)(c);
263 }
264 #endif
265
266 static int /* 0 success, otherwise REG_something */
267 regcomp_internal(regex_t * __restrict preg,
268 const char * __restrict pattern,
269 int cflags, int pflags)
270 {
271 struct parse pa;
272 struct re_guts *g;
273 struct parse *p = &pa;
274 int i;
275 size_t len;
276 size_t maxlen;
277 #ifdef REDEBUG
278 # define GOODFLAGS(f) (f)
279 #else
280 # define GOODFLAGS(f) ((f)&~REG_DUMP)
281 #endif
282
283 _DIAGASSERT(preg != NULL);
284 _DIAGASSERT(pattern != NULL);
285
286 cflags = GOODFLAGS(cflags);
287 if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
288 return(REG_INVARG);
289
290 if (cflags®_PEND) {
291 if (preg->re_endp < pattern)
292 return(REG_INVARG);
293 len = preg->re_endp - pattern;
294 } else
295 len = strlen(pattern);
296
297 /* do the mallocs early so failure handling is easy */
298 g = malloc(sizeof(*g));
299 if (g == NULL)
300 return(REG_ESPACE);
301 /*
302 * Limit the pattern space to avoid a 32-bit overflow on buffer
303 * extension. Also avoid any signed overflow in case of conversion
304 * so make the real limit based on a 31-bit overflow.
305 *
306 * Likely not applicable on 64-bit systems but handle the case
307 * generically (who are we to stop people from using ~715MB+
308 * patterns?).
309 */
310 maxlen = ((size_t)-1 >> 1) / sizeof(*p->strip) * 2 / 3;
311 if (len >= maxlen) {
312 free(g);
313 return(REG_ESPACE);
314 }
315 p->ssize = (sopno)(len / 2 * 3 + 1); /* ugh */
316 assert(p->ssize >= len);
317
318 p->strip = calloc(p->ssize, sizeof(*p->strip));
319 p->slen = 0;
320 if (p->strip == NULL) {
321 free(g);
322 return(REG_ESPACE);
323 }
324
325 /* set things up */
326 p->g = g;
327 p->next = pattern; /* convenience; we do not modify it */
328 p->end = p->next + len;
329 p->error = 0;
330 p->ncsalloc = 0;
331 p->pflags = pflags;
332 for (i = 0; i < NPAREN; i++) {
333 p->pbegin[i] = 0;
334 p->pend[i] = 0;
335 }
336 #ifdef REGEX_GNU_EXTENSIONS
337 if ((cflags & REG_GNU) == 0) {
338 p->gnuext = false;
339 p->allowbranch = (cflags & REG_EXTENDED) != 0;
340 } else
341 p->gnuext = p->allowbranch = true;
342 #else
343 p->gnuext = false;
344 p->allowbranch = (cflags & REG_EXTENDED) != 0;
345 #endif
346 if (cflags & REG_EXTENDED) {
347 p->bre = false;
348 p->parse_expr = p_ere_exp;
349 p->pre_parse = NULL;
350 p->post_parse = NULL;
351 } else {
352 p->bre = true;
353 p->parse_expr = p_simp_re;
354 p->pre_parse = p_bre_pre_parse;
355 p->post_parse = p_bre_post_parse;
356 }
357 g->sets = NULL;
358 g->ncsets = 0;
359 g->cflags = cflags;
360 g->iflags = 0;
361 g->nbol = 0;
362 g->neol = 0;
363 g->must = NULL;
364 g->moffset = -1;
365 g->charjump = NULL;
366 g->matchjump = NULL;
367 g->mlen = 0;
368 g->nsub = 0;
369 g->backrefs = 0;
370
371 /* do it */
372 EMIT(OEND, 0);
373 g->firststate = THERE();
374 if (cflags & REG_NOSPEC)
375 p_str(p);
376 else
377 p_re(p, OUT, OUT);
378 EMIT(OEND, 0);
379 g->laststate = THERE();
380
381 /* tidy up loose ends and fill things in */
382 stripsnug(p, g);
383 findmust(p, g);
384 /* only use Boyer-Moore algorithm if the pattern is bigger
385 * than three characters
386 */
387 if(g->mlen > 3) {
388 computejumps(p, g);
389 computematchjumps(p, g);
390 if(g->matchjump == NULL && g->charjump != NULL) {
391 free(g->charjump);
392 g->charjump = NULL;
393 }
394 }
395 g->nplus = pluscount(p, g);
396 g->magic = MAGIC2;
397 preg->re_nsub = g->nsub;
398 preg->re_g = g;
399 preg->re_magic = MAGIC1;
400 #ifndef REDEBUG
401 /* not debugging, so can't rely on the assert() in regexec() */
402 if (g->iflags&BAD)
403 SETERROR(REG_ASSERT);
404 #endif
405
406 /* win or lose, we're done */
407 if (p->error != 0) /* lose */
408 regfree(preg);
409 return(p->error);
410 }
411
412 /*
413 - regcomp - interface for parser and compilation
414 = extern int regcomp(regex_t *, const char *, int);
415 = #define REG_BASIC 0000
416 = #define REG_EXTENDED 0001
417 = #define REG_ICASE 0002
418 = #define REG_NOSUB 0004
419 = #define REG_NEWLINE 0010
420 = #define REG_NOSPEC 0020
421 = #define REG_PEND 0040
422 = #define REG_DUMP 0200
423 */
424 int /* 0 success, otherwise REG_something */
425 regcomp(regex_t * __restrict preg,
426 const char * __restrict pattern,
427 int cflags)
428 {
429
430 return (regcomp_internal(preg, pattern, cflags, 0));
431 }
432
433 /*
434 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
435 - return whether we should terminate or not
436 == static bool p_ere_exp(struct parse *p);
437 */
438 static bool
439 p_ere_exp(struct parse *p, struct branchc *bc)
440 {
441 char c;
442 wint_t wc;
443 sopno pos;
444 int count;
445 int count2;
446 #ifdef REGEX_GNU_EXTENSIONS
447 size_t i;
448 int handled;
449 #endif
450 sopno subno;
451 int wascaret = 0;
452
453 _DIAGASSERT(p != NULL);
454
455 (void)bc;
456 assert(MORE()); /* caller should have ensured this */
457 c = GETNEXT();
458
459 #ifdef REGEX_GNU_EXTENSIONS
460 handled = 0;
461 #endif
462 pos = HERE();
463 switch (c) {
464 case '(':
465 (void)REQUIRE(MORE(), REG_EPAREN);
466 p->g->nsub++;
467 subno = (sopno)p->g->nsub;
468 if (subno < NPAREN)
469 p->pbegin[subno] = HERE();
470 EMIT(OLPAREN, subno);
471 if (!SEE(')'))
472 p_re(p, ')', IGN);
473 if (subno < NPAREN) {
474 p->pend[subno] = HERE();
475 assert(p->pend[subno] != 0);
476 }
477 EMIT(ORPAREN, subno);
478 (void)MUSTEAT(')', REG_EPAREN);
479 break;
480 #ifndef POSIX_MISTAKE
481 case ')': /* happens only if no current unmatched ( */
482 /*
483 * You may ask, why the ifndef? Because I didn't notice
484 * this until slightly too late for 1003.2, and none of the
485 * other 1003.2 regular-expression reviewers noticed it at
486 * all. So an unmatched ) is legal POSIX, at least until
487 * we can get it fixed.
488 */
489 SETERROR(REG_EPAREN);
490 break;
491 #endif
492 case '^':
493 EMIT(OBOL, 0);
494 p->g->iflags |= USEBOL;
495 p->g->nbol++;
496 wascaret = 1;
497 break;
498 case '$':
499 EMIT(OEOL, 0);
500 p->g->iflags |= USEEOL;
501 p->g->neol++;
502 break;
503 case '|':
504 SETERROR(REG_EMPTY);
505 break;
506 case '*':
507 case '+':
508 case '?':
509 case '{':
510 SETERROR(REG_BADRPT);
511 break;
512 case '.':
513 if (p->g->cflags®_NEWLINE)
514 nonnewline(p);
515 else
516 EMIT(OANY, 0);
517 break;
518 case '[':
519 p_bracket(p);
520 break;
521 case '\\':
522 (void)REQUIRE(MORE(), REG_EESCAPE);
523 wc = WGETNEXT();
524 #ifdef REGEX_GNU_EXTENSIONS
525 if (p->gnuext) {
526 handled = 1;
527 switch (wc) {
528 case '`':
529 EMIT(OBOS, 0);
530 break;
531 case '\'':
532 EMIT(OEOS, 0);
533 break;
534 case 'B':
535 EMIT(ONWBND, 0);
536 break;
537 case 'b':
538 EMIT(OWBND, 0);
539 break;
540 case 'W':
541 case 'w':
542 case 'S':
543 case 's':
544 p_b_pseudoclass(p, wc);
545 break;
546 case '1':
547 case '2':
548 case '3':
549 case '4':
550 case '5':
551 case '6':
552 case '7':
553 case '8':
554 case '9':
555 i = wc - '0';
556 assert(i < NPAREN);
557 if (p->pend[i] != 0) {
558 assert(i <= p->g->nsub);
559 EMIT(OBACK_, i);
560 assert(p->pbegin[i] != 0);
561 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
562 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
563 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
564 EMIT(O_BACK, i);
565 } else
566 SETERROR(REG_ESUBREG);
567 p->g->backrefs = 1;
568 break;
569 default:
570 handled = 0;
571 }
572 /* Don't proceed to the POSIX bits if we've already handled it */
573 if (handled)
574 break;
575 }
576 #endif
577 switch (wc) {
578 case '<':
579 EMIT(OBOW, 0);
580 break;
581 case '>':
582 EMIT(OEOW, 0);
583 break;
584 default:
585 if (may_escape(p, wc))
586 ordinary(p, wc);
587 else
588 SETERROR(REG_EESCAPE);
589 break;
590 }
591 break;
592 default:
593 if (p->error != 0)
594 return (false);
595 p->next--;
596 wc = WGETNEXT();
597 ordinary(p, wc);
598 break;
599 }
600
601 if (!MORE())
602 return (false);
603 c = PEEK();
604 /* we call { a repetition if followed by a digit */
605 if (!( c == '*' || c == '+' || c == '?' || c == '{'))
606 return (false); /* no repetition, we're done */
607 else if (c == '{')
608 (void)REQUIRE(MORE2() && \
609 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
610 NEXT();
611
612 (void)REQUIRE(!wascaret, REG_BADRPT);
613 switch (c) {
614 case '*': /* implemented as +? */
615 /* this case does not require the (y|) trick, noKLUDGE */
616 INSERT(OPLUS_, pos);
617 ASTERN(O_PLUS, pos);
618 INSERT(OQUEST_, pos);
619 ASTERN(O_QUEST, pos);
620 break;
621 case '+':
622 INSERT(OPLUS_, pos);
623 ASTERN(O_PLUS, pos);
624 break;
625 case '?':
626 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
627 INSERT(OCH_, pos); /* offset slightly wrong */
628 ASTERN(OOR1, pos); /* this one's right */
629 AHEAD(pos); /* fix the OCH_ */
630 EMIT(OOR2, 0); /* offset very wrong... */
631 AHEAD(THERE()); /* ...so fix it */
632 ASTERN(O_CH, THERETHERE());
633 break;
634 case '{':
635 count = p_count(p);
636 if (EAT(',')) {
637 if (isdigit((uch)PEEK())) {
638 count2 = p_count(p);
639 (void)REQUIRE(count <= count2, REG_BADBR);
640 } else /* single number with comma */
641 count2 = INFINITY;
642 } else /* just a single number */
643 count2 = count;
644 repeat(p, pos, count, count2);
645 if (!EAT('}')) { /* error heuristics */
646 while (MORE() && PEEK() != '}')
647 NEXT();
648 (void)REQUIRE(MORE(), REG_EBRACE);
649 SETERROR(REG_BADBR);
650 }
651 break;
652 }
653
654 if (!MORE())
655 return (false);
656 c = PEEK();
657 if (!( c == '*' || c == '+' || c == '?' ||
658 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
659 return (false);
660 SETERROR(REG_BADRPT);
661 return (false);
662 }
663
664 /*
665 - p_str - string (no metacharacters) "parser"
666 == static void p_str(struct parse *p);
667 */
668 static void
669 p_str(struct parse *p)
670 {
671 (void)REQUIRE(MORE(), REG_EMPTY);
672 while (MORE())
673 ordinary(p, WGETNEXT());
674 }
675
676 /*
677 * Eat consecutive branch delimiters for the kind of expression that we are
678 * parsing, return the number of delimiters that we ate.
679 */
680 static int
681 p_branch_eat_delim(struct parse *p, struct branchc *bc)
682 {
683 int nskip;
684
685 (void)bc;
686 nskip = 0;
687 while (EATSPEC('|'))
688 ++nskip;
689 return (nskip);
690 }
691
692 /*
693 * Insert necessary branch book-keeping operations. This emits a
694 * bogus 'next' offset, since we still have more to parse
695 */
696 static void
697 p_branch_ins_offset(struct parse *p, struct branchc *bc)
698 {
699
700 if (bc->nbranch == 0) {
701 INSERT(OCH_, bc->start); /* offset is wrong */
702 bc->fwd = bc->start;
703 bc->back = bc->start;
704 }
705
706 ASTERN(OOR1, bc->back);
707 bc->back = THERE();
708 AHEAD(bc->fwd); /* fix previous offset */
709 bc->fwd = HERE();
710 EMIT(OOR2, 0); /* offset is very wrong */
711 ++bc->nbranch;
712 }
713
714 /*
715 * Fix the offset of the tail branch, if we actually had any branches.
716 * This is to correct the bogus placeholder offset that we use.
717 */
718 static void
719 p_branch_fix_tail(struct parse *p, struct branchc *bc)
720 {
721
722 /* Fix bogus offset at the tail if we actually have branches */
723 if (bc->nbranch > 0) {
724 AHEAD(bc->fwd);
725 ASTERN(O_CH, bc->back);
726 }
727 }
728
729 /*
730 * Signal to the parser that an empty branch has been encountered; this will,
731 * in the future, be used to allow for more permissive behavior with empty
732 * branches. The return value should indicate whether parsing may continue
733 * or not.
734 */
735 static bool
736 p_branch_empty(struct parse *p, struct branchc *bc)
737 {
738
739 (void)bc;
740 SETERROR(REG_EMPTY);
741 return (false);
742 }
743
744 /*
745 * Take care of any branching requirements. This includes inserting the
746 * appropriate branching instructions as well as eating all of the branch
747 * delimiters until we either run out of pattern or need to parse more pattern.
748 */
749 static bool
750 p_branch_do(struct parse *p, struct branchc *bc)
751 {
752 int ate = 0;
753
754 ate = p_branch_eat_delim(p, bc);
755 if (ate == 0)
756 return (false);
757 else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
758 /*
759 * Halt parsing only if we have an empty branch and p_branch_empty
760 * indicates that we must not continue. In the future, this will not
761 * necessarily be an error.
762 */
763 return (false);
764 p_branch_ins_offset(p, bc);
765
766 return (true);
767 }
768
769 static void
770 p_bre_pre_parse(struct parse *p, struct branchc *bc)
771 {
772
773 (void)bc;
774 /*
775 * Does not move cleanly into expression parser because of
776 * ordinary interpration of * at the beginning position of
777 * an expression.
778 */
779 if (EAT('^')) {
780 EMIT(OBOL, 0);
781 p->g->iflags |= USEBOL;
782 p->g->nbol++;
783 }
784 }
785
786 static void
787 p_bre_post_parse(struct parse *p, struct branchc *bc)
788 {
789
790 /* Expression is terminating due to EOL token */
791 if (bc->terminate) {
792 DROP(1);
793 EMIT(OEOL, 0);
794 p->g->iflags |= USEEOL;
795 p->g->neol++;
796 }
797 }
798
799 /*
800 - p_re - Top level parser, concatenation and BRE anchoring
801 == static void p_re(struct parse *p, int end1, int end2);
802 * Giving end1 as OUT essentially eliminates the end1/end2 check.
803 *
804 * This implementation is a bit of a kludge, in that a trailing $ is first
805 * taken as an ordinary character and then revised to be an anchor.
806 * The amount of lookahead needed to avoid this kludge is excessive.
807 */
808 static void
809 p_re(struct parse *p,
810 int end1, /* first terminating character */
811 int end2) /* second terminating character; ignored for EREs */
812 {
813 struct branchc bc;
814
815 bc.nbranch = 0;
816 if (end1 == OUT && end2 == OUT)
817 bc.outer = true;
818 else
819 bc.outer = false;
820 #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2))
821 for (;;) {
822 bc.start = HERE();
823 bc.nchain = 0;
824 bc.terminate = false;
825 if (p->pre_parse != NULL)
826 p->pre_parse(p, &bc);
827 while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
828 bc.terminate = p->parse_expr(p, &bc);
829 ++bc.nchain;
830 }
831 if (p->post_parse != NULL)
832 p->post_parse(p, &bc);
833 (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY);
834 #ifdef REGEX_GNU_EXTENSIONS
835 if (p->gnuext && HERE() == bc.start && !p_branch_empty(p, &bc))
836 break;
837 #endif
838 if (!p->allowbranch)
839 break;
840 /*
841 * p_branch_do's return value indicates whether we should
842 * continue parsing or not. This is both for correctness and
843 * a slight optimization, because it will check if we've
844 * encountered an empty branch or the end of the string
845 * immediately following a branch delimiter.
846 */
847 if (!p_branch_do(p, &bc))
848 break;
849 }
850 #undef SEE_END
851 if (p->allowbranch)
852 p_branch_fix_tail(p, &bc);
853 assert(!MORE() || SEE(end1));
854 }
855
856 /*
857 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
858 == static bool p_simp_re(struct parse *p, struct branchc *bc);
859 */
860 static bool /* was the simple RE an unbackslashed $? */
861 p_simp_re(struct parse *p, struct branchc *bc)
862 {
863 int c;
864 int cc; /* convenient/control character */
865 int count;
866 int count2;
867 sopno pos;
868 bool handled;
869 size_t i;
870 wint_t wc;
871 sopno subno;
872 # define BACKSL (1<<CHAR_BIT)
873
874 pos = HERE(); /* repetition op, if any, covers from here */
875 handled = false;
876
877 assert(MORE()); /* caller should have ensured this */
878 c = GETNEXT();
879 if (c == '\\') {
880 (void)REQUIRE(MORE(), REG_EESCAPE);
881 cc = GETNEXT();
882 c = BACKSL | cc;
883 #ifdef REGEX_GNU_EXTENSIONS
884 if (p->gnuext) {
885 handled = true;
886 switch (c) {
887 case BACKSL|'`':
888 EMIT(OBOS, 0);
889 break;
890 case BACKSL|'\'':
891 EMIT(OEOS, 0);
892 break;
893 case BACKSL|'B':
894 EMIT(ONWBND, 0);
895 break;
896 case BACKSL|'b':
897 EMIT(OWBND, 0);
898 break;
899 case BACKSL|'W':
900 case BACKSL|'w':
901 case BACKSL|'S':
902 case BACKSL|'s':
903 p_b_pseudoclass(p, cc);
904 break;
905 default:
906 handled = false;
907 }
908 }
909 #endif
910 }
911 if (!handled) {
912 switch (c) {
913 case '.':
914 if (p->g->cflags®_NEWLINE)
915 nonnewline(p);
916 else
917 EMIT(OANY, 0);
918 break;
919 case '[':
920 p_bracket(p);
921 break;
922 case BACKSL|'<':
923 EMIT(OBOW, 0);
924 break;
925 case BACKSL|'>':
926 EMIT(OEOW, 0);
927 break;
928 case BACKSL|'{':
929 SETERROR(REG_BADRPT);
930 break;
931 case BACKSL|'(':
932 p->g->nsub++;
933 subno = (sopno)p->g->nsub;
934 if (subno < NPAREN)
935 p->pbegin[subno] = HERE();
936 EMIT(OLPAREN, subno);
937 /* the MORE here is an error heuristic */
938 if (MORE() && !SEETWO('\\', ')'))
939 p_re(p, '\\', ')');
940 if (subno < NPAREN) {
941 p->pend[subno] = HERE();
942 assert(p->pend[subno] != 0);
943 }
944 EMIT(ORPAREN, subno);
945 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
946 break;
947 case BACKSL|')': /* should not get here -- must be user */
948 SETERROR(REG_EPAREN);
949 break;
950 case BACKSL|'1':
951 case BACKSL|'2':
952 case BACKSL|'3':
953 case BACKSL|'4':
954 case BACKSL|'5':
955 case BACKSL|'6':
956 case BACKSL|'7':
957 case BACKSL|'8':
958 case BACKSL|'9':
959 i = (c&~BACKSL) - '0';
960 assert(i < NPAREN);
961 if (p->pend[i] != 0) {
962 assert(i <= p->g->nsub);
963 EMIT(OBACK_, i);
964 assert(p->pbegin[i] != 0);
965 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
966 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
967 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
968 EMIT(O_BACK, i);
969 } else
970 SETERROR(REG_ESUBREG);
971 p->g->backrefs = 1;
972 break;
973 case '*':
974 /*
975 * Ordinary if used as the first character beyond BOL anchor of
976 * a (sub-)expression, counts as a bad repetition operator if it
977 * appears otherwise.
978 */
979 (void)REQUIRE(bc->nchain == 0, REG_BADRPT);
980 /* FALLTHROUGH */
981 default:
982 if (p->error != 0)
983 return (false); /* Definitely not $... */
984 p->next--;
985 wc = WGETNEXT();
986 if ((c & BACKSL) == 0 || may_escape(p, wc))
987 ordinary(p, wc);
988 else
989 SETERROR(REG_EESCAPE);
990 break;
991 }
992 }
993
994 if (EAT('*')) { /* implemented as +? */
995 /* this case does not require the (y|) trick, noKLUDGE */
996 INSERT(OPLUS_, pos);
997 ASTERN(O_PLUS, pos);
998 INSERT(OQUEST_, pos);
999 ASTERN(O_QUEST, pos);
1000 #ifdef REGEX_GNU_EXTENSIONS
1001 } else if (p->gnuext && EATTWO('\\', '?')) {
1002 INSERT(OQUEST_, pos);
1003 ASTERN(O_QUEST, pos);
1004 } else if (p->gnuext && EATTWO('\\', '+')) {
1005 INSERT(OPLUS_, pos);
1006 ASTERN(O_PLUS, pos);
1007 #endif
1008 } else if (EATTWO('\\', '{')) {
1009 count = p_count(p);
1010 if (EAT(',')) {
1011 if (MORE() && isdigit((uch)PEEK())) {
1012 count2 = p_count(p);
1013 (void)REQUIRE(count <= count2, REG_BADBR);
1014 } else /* single number with comma */
1015 count2 = INFINITY;
1016 } else /* just a single number */
1017 count2 = count;
1018 repeat(p, pos, count, count2);
1019 if (!EATTWO('\\', '}')) { /* error heuristics */
1020 while (MORE() && !SEETWO('\\', '}'))
1021 NEXT();
1022 (void)REQUIRE(MORE(), REG_EBRACE);
1023 SETERROR(REG_BADBR);
1024 }
1025 } else if (c == '$') /* $ (but not \$) ends it */
1026 return (true);
1027
1028 return (false);
1029 }
1030
1031 /*
1032 - p_count - parse a repetition count
1033 == static int p_count(struct parse *p);
1034 */
1035 static int /* the value */
1036 p_count(struct parse *p)
1037 {
1038 int count = 0;
1039 int ndigits = 0;
1040
1041 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
1042 count = count*10 + (GETNEXT() - '0');
1043 ndigits++;
1044 }
1045
1046 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
1047 return(count);
1048 }
1049
1050 /*
1051 - p_bracket - parse a bracketed character list
1052 == static void p_bracket(struct parse *p);
1053 */
1054 static void
1055 p_bracket(struct parse *p)
1056 {
1057 cset *cs;
1058 wint_t ch;
1059
1060 /* Dept of Truly Sickening Special-Case Kludges */
1061 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
1062 EMIT(OBOW, 0);
1063 NEXTn(6);
1064 return;
1065 }
1066 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
1067 EMIT(OEOW, 0);
1068 NEXTn(6);
1069 return;
1070 }
1071
1072 if ((cs = allocset(p)) == NULL)
1073 return;
1074
1075 if (p->g->cflags®_ICASE)
1076 cs->icase = 1;
1077 if (EAT('^'))
1078 cs->invert = 1;
1079 if (EAT(']'))
1080 CHadd(p, cs, ']');
1081 else if (EAT('-'))
1082 CHadd(p, cs, '-');
1083 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
1084 p_b_term(p, cs);
1085 if (EAT('-'))
1086 CHadd(p, cs, '-');
1087 (void)MUSTEAT(']', REG_EBRACK);
1088
1089 if (p->error != 0) /* don't mess things up further */
1090 return;
1091
1092 if (cs->invert && p->g->cflags®_NEWLINE)
1093 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
1094
1095 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
1096 ordinary(p, ch);
1097 freeset(p, cs);
1098 } else
1099 EMIT(OANYOF, (size_t)(cs - p->g->sets));
1100 }
1101
1102 static int
1103 p_range_cmp(wchar_t c1, wchar_t c2)
1104 {
1105 #ifdef REGEX_LIBC_COLLATE
1106 return __wcollate_range_cmp(c1, c2);
1107 #elif defined(NLS)
1108 /* Copied from libc/collate __wcollate_range_cmp */
1109 wchar_t s1[2], s2[2];
1110
1111 s1[0] = c1;
1112 s1[1] = L'\0';
1113 s2[0] = c2;
1114 s2[1] = L'\0';
1115 return wcscoll(s1, s2);
1116 #else
1117 char s1[2], s2[2];
1118
1119 s1[0] = (char)c1;
1120 s1[1] = '\0';
1121 s2[0] = (char)c2;
1122 s2[1] = '\0';
1123 return strcoll(s1, s2);
1124 #endif
1125 }
1126
1127 /*
1128 - p_b_term - parse one term of a bracketed character list
1129 == static void p_b_term(struct parse *p, cset *cs);
1130 */
1131 static void
1132 p_b_term(struct parse *p, cset *cs)
1133 {
1134 char c;
1135 wint_t start, finish;
1136 wint_t i;
1137 #ifdef REGEX_LIBC_COLLATE
1138 struct xlocale_collate *table =
1139 (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
1140 #endif
1141
1142 _DIAGASSERT(p != NULL);
1143 _DIAGASSERT(cs != NULL);
1144
1145 /* classify what we've got */
1146 switch ((MORE()) ? PEEK() : '\0') {
1147 case '[':
1148 c = (MORE2()) ? PEEK2() : '\0';
1149 break;
1150 case '-':
1151 SETERROR(REG_ERANGE);
1152 return; /* NOTE RETURN */
1153 default:
1154 c = '\0';
1155 break;
1156 }
1157
1158 switch (c) {
1159 case ':': /* character class */
1160 NEXT2();
1161 (void)REQUIRE(MORE(), REG_EBRACK);
1162 c = PEEK();
1163 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
1164 p_b_cclass(p, cs);
1165 (void)REQUIRE(MORE(), REG_EBRACK);
1166 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
1167 break;
1168 case '=': /* equivalence class */
1169 NEXT2();
1170 (void)REQUIRE(MORE(), REG_EBRACK);
1171 c = PEEK();
1172 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
1173 p_b_eclass(p, cs);
1174 (void)REQUIRE(MORE(), REG_EBRACK);
1175 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
1176 break;
1177 default: /* symbol, ordinary character, or range */
1178 start = p_b_symbol(p);
1179 if (SEE('-') && MORE2() && PEEK2() != ']') {
1180 /* range */
1181 NEXT();
1182 if (EAT('-'))
1183 finish = '-';
1184 else
1185 finish = p_b_symbol(p);
1186 } else
1187 finish = start;
1188 if (start == finish)
1189 CHadd(p, cs, start);
1190 else {
1191 #ifdef REGEX_LIBC_COLLATE
1192 if (table->__collate_load_error || MB_CUR_MAX > 1) {
1193 #else
1194 if (MB_CUR_MAX > 1) {
1195 #endif
1196 (void)REQUIRE(start <= finish, REG_ERANGE);
1197 CHaddrange(p, cs, start, finish);
1198 } else {
1199 (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE);
1200 for (i = 0; i <= UCHAR_MAX; i++) {
1201 if (p_range_cmp(start, i) <= 0 &&
1202 p_range_cmp(i, finish) <= 0 )
1203 CHadd(p, cs, i);
1204 }
1205 }
1206 }
1207 break;
1208 }
1209 }
1210
1211 #ifdef REGEX_GNU_EXTENSIONS
1212 /*
1213 - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S)
1214 == static int p_b_pseudoclass(struct parse *p, char c)
1215 */
1216 static int
1217 p_b_pseudoclass(struct parse *p, char c) {
1218 cset *cs;
1219
1220 if ((cs = allocset(p)) == NULL)
1221 return(0);
1222
1223 if (p->g->cflags®_ICASE)
1224 cs->icase = 1;
1225
1226 switch (c) {
1227 case 'W':
1228 cs->invert = 1;
1229 /* FALLTHROUGH */
1230 case 'w':
1231 p_b_cclass_named(p, cs, "alnum");
1232 break;
1233 case 'S':
1234 cs->invert = 1;
1235 /* FALLTHROUGH */
1236 case 's':
1237 p_b_cclass_named(p, cs, "space");
1238 break;
1239 default:
1240 return(0);
1241 }
1242
1243 EMIT(OANYOF, (size_t)(cs - p->g->sets));
1244 return(1);
1245 }
1246 #endif
1247
1248 /*
1249 - p_b_cclass - parse a character-class name and deal with it
1250 == static void p_b_cclass(struct parse *p, cset *cs);
1251 */
1252 static void
1253 p_b_cclass(struct parse *p, cset *cs)
1254 {
1255 const char *sp = p->next;
1256 size_t len;
1257 char clname[16];
1258
1259 while (MORE() && isalpha((uch)PEEK()))
1260 NEXT();
1261 len = p->next - sp;
1262 if (len >= sizeof(clname) - 1) {
1263 SETERROR(REG_ECTYPE);
1264 return;
1265 }
1266 memcpy(clname, sp, len);
1267 clname[len] = '\0';
1268
1269 p_b_cclass_named(p, cs, clname);
1270 }
1271
1272 /*
1273 - p_b_cclass_named - deal with a named character class
1274 == static void p_b_cclass_named(struct parse *p, cset *cs, const char []);
1275 */
1276 static void
1277 p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) {
1278 wctype_t wct;
1279
1280 if ((wct = wctype(clname)) == 0) {
1281 SETERROR(REG_ECTYPE);
1282 return;
1283 }
1284 CHaddtype(p, cs, wct);
1285 }
1286
1287 /*
1288 - p_b_eclass - parse an equivalence-class name and deal with it
1289 == static void p_b_eclass(struct parse *p, cset *cs);
1290 *
1291 * This implementation is incomplete. xxx
1292 */
1293 static void
1294 p_b_eclass(struct parse *p, cset *cs)
1295 {
1296 wint_t c;
1297
1298 _DIAGASSERT(p != NULL);
1299 _DIAGASSERT(cs != NULL);
1300
1301 c = p_b_coll_elem(p, '=');
1302 CHadd(p, cs, c);
1303 }
1304
1305 /*
1306 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1307 == static wint_t p_b_symbol(struct parse *p);
1308 */
1309 static wint_t /* value of symbol */
1310 p_b_symbol(struct parse *p)
1311 {
1312 wint_t value;
1313
1314 _DIAGASSERT(p != NULL);
1315
1316 (void)REQUIRE(MORE(), REG_EBRACK);
1317 if (!EATTWO('[', '.'))
1318 return(WGETNEXT());
1319
1320 /* collating symbol */
1321 value = p_b_coll_elem(p, '.');
1322 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1323 return(value);
1324 }
1325
1326 /*
1327 - p_b_coll_elem - parse a collating-element name and look it up
1328 == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
1329 */
1330 static wint_t /* value of collating element */
1331 p_b_coll_elem(struct parse *p,
1332 wint_t endc) /* name ended by endc,']' */
1333 {
1334 const char *sp = p->next;
1335 struct cname *cp;
1336 size_t len;
1337
1338 _DIAGASSERT(p != NULL);
1339
1340 while (MORE() && !SEETWO(endc, ']'))
1341 NEXT();
1342 if (!MORE()) {
1343 SETERROR(REG_EBRACK);
1344 return(0);
1345 }
1346 len = p->next - sp;
1347 for (cp = cnames; cp->name != NULL; cp++)
1348 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1349 return(cp->code); /* known name */
1350 #ifdef NLS
1351 mbstate_t mbs;
1352 wchar_t wc;
1353 size_t clen;
1354
1355 memset(&mbs, 0, sizeof(mbs));
1356 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1357 return (wc); /* single character */
1358 else if (clen == (size_t)-1 || clen == (size_t)-2)
1359 SETERROR(REG_ILLSEQ);
1360 else
1361 SETERROR(REG_ECOLLATE); /* neither */
1362 return(0);
1363 #else
1364 if (len == 1)
1365 return *sp; /* single character */
1366 SETERROR(REG_ECOLLATE); /* neither */
1367 return 0;
1368 #endif
1369 }
1370
1371 /*
1372 - may_escape - determine whether 'ch' is escape-able in the current context
1373 == static int may_escape(struct parse *p, const wint_t ch)
1374 */
1375 static bool
1376 may_escape(struct parse *p, const wint_t ch)
1377 {
1378
1379 if ((p->pflags & PFLAG_LEGACY_ESC) != 0)
1380 return (true);
1381 if (isalpha(ch) || ch == '\'' || ch == '`')
1382 return (false);
1383 return (true);
1384 #ifdef NOTYET
1385 /*
1386 * Build a whitelist of characters that may be escaped to produce an
1387 * ordinary in the current context. This assumes that these have not
1388 * been otherwise interpreted as a special character. Escaping an
1389 * ordinary character yields undefined results according to
1390 * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
1391 * advantage of this and use escaped ordinary characters to provide
1392 * special meaning, e.g. \b, \B, \w, \W, \s, \S.
1393 */
1394 switch(ch) {
1395 case '|':
1396 case '+':
1397 case '?':
1398 /* The above characters may not be escaped in BREs */
1399 if (!(p->g->cflags®_EXTENDED))
1400 return (false);
1401 /* Fallthrough */
1402 case '(':
1403 case ')':
1404 case '{':
1405 case '}':
1406 case '.':
1407 case '[':
1408 case ']':
1409 case '\\':
1410 case '*':
1411 case '^':
1412 case '$':
1413 return (true);
1414 default:
1415 return (false);
1416 }
1417 #endif
1418 }
1419
1420 /*
1421 - othercase - return the case counterpart of an alphabetic
1422 == static wint_t othercase(wint_t ch);
1423 */
1424 static wint_t /* if no counterpart, return ch */
1425 othercase(wint_t ch)
1426 {
1427 assert(iswalpha(ch));
1428 if (iswupper(ch))
1429 return(towlower(ch));
1430 else if (iswlower(ch))
1431 return(towupper(ch));
1432 else /* peculiar, but could happen */
1433 return(ch);
1434 }
1435
1436 /*
1437 - bothcases - emit a dualcase version of a two-case character
1438 == static void bothcases(struct parse *p, wint_t ch);
1439 *
1440 * Boy, is this implementation ever a kludge...
1441 */
1442 static void
1443 bothcases(struct parse *p, wint_t ch)
1444 {
1445 const char *oldnext = p->next;
1446 const char *oldend = p->end;
1447 char bracket[3 + MB_LEN_MAX];
1448 size_t n;
1449
1450 _DIAGASSERT(p != NULL);
1451
1452 assert(othercase(ch) != ch); /* p_bracket() would recurse */
1453 p->next = bracket;
1454 #ifdef NLS
1455 mbstate_t mbs;
1456 memset(&mbs, 0, sizeof(mbs));
1457 n = wcrtomb(bracket, ch, &mbs);
1458 assert(n != (size_t)-1);
1459 #else
1460 n = 0;
1461 bracket[n++] = ch;
1462 #endif
1463 bracket[n] = ']';
1464 bracket[n + 1] = '\0';
1465 p->end = bracket+n+1;
1466 p_bracket(p);
1467 assert(p->next == p->end);
1468 p->next = oldnext;
1469 p->end = oldend;
1470 }
1471
1472 /*
1473 - ordinary - emit an ordinary character
1474 == static void ordinary(struct parse *p, wint_t ch);
1475 */
1476 static void
1477 ordinary(struct parse *p, wint_t ch)
1478 {
1479 cset *cs;
1480
1481 _DIAGASSERT(p != NULL);
1482
1483 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1484 bothcases(p, ch);
1485 else if ((wint_t)(ch & OPDMASK) == ch)
1486 EMIT(OCHAR, (size_t)ch);
1487 else {
1488 /*
1489 * Kludge: character is too big to fit into an OCHAR operand.
1490 * Emit a singleton set.
1491 */
1492 if ((cs = allocset(p)) == NULL)
1493 return;
1494 CHadd(p, cs, ch);
1495 EMIT(OANYOF, (size_t)(cs - p->g->sets));
1496 }
1497 }
1498
1499 /*
1500 - nonnewline - emit REG_NEWLINE version of OANY
1501 == static void nonnewline(struct parse *p);
1502 *
1503 * Boy, is this implementation ever a kludge...
1504 */
1505 static void
1506 nonnewline(struct parse *p)
1507 {
1508 const char *oldnext = p->next;
1509 const char *oldend = p->end;
1510 char bracket[4];
1511
1512 _DIAGASSERT(p != NULL);
1513
1514 p->next = bracket;
1515 p->end = bracket+3;
1516 bracket[0] = '^';
1517 bracket[1] = '\n';
1518 bracket[2] = ']';
1519 bracket[3] = '\0';
1520 p_bracket(p);
1521 assert(p->next == bracket+3);
1522 p->next = oldnext;
1523 p->end = oldend;
1524 }
1525
1526 /*
1527 - repeat - generate code for a bounded repetition, recursively if needed
1528 == static void repeat(struct parse *p, sopno start, int from, int to);
1529 */
1530 static void
1531 repeat(struct parse *p,
1532 sopno start, /* operand from here to end of strip */
1533 int from, /* repeated from this number */
1534 int to) /* to this number of times (maybe INFINITY) */
1535 {
1536 sopno finish = HERE();
1537 # define N 2
1538 # define INF 3
1539 # define REP(f, t) ((f)*8 + (t))
1540 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1541 sopno copy;
1542
1543 _DIAGASSERT(p != NULL);
1544
1545 if (p->error != 0) /* head off possible runaway recursion */
1546 return;
1547
1548 assert(from <= to);
1549
1550 switch (REP(MAP(from), MAP(to))) {
1551 case REP(0, 0): /* must be user doing this */
1552 DROP(finish-start); /* drop the operand */
1553 break;
1554 case REP(0, 1): /* as x{1,1}? */
1555 case REP(0, N): /* as x{1,n}? */
1556 case REP(0, INF): /* as x{1,}? */
1557 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1558 INSERT(OCH_, start); /* offset is wrong... */
1559 repeat(p, start+1, 1, to);
1560 ASTERN(OOR1, start);
1561 AHEAD(start); /* ... fix it */
1562 EMIT(OOR2, 0);
1563 AHEAD(THERE());
1564 ASTERN(O_CH, THERETHERE());
1565 break;
1566 case REP(1, 1): /* trivial case */
1567 /* done */
1568 break;
1569 case REP(1, N): /* as x?x{1,n-1} */
1570 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1571 INSERT(OCH_, start);
1572 ASTERN(OOR1, start);
1573 AHEAD(start);
1574 EMIT(OOR2, 0); /* offset very wrong... */
1575 AHEAD(THERE()); /* ...so fix it */
1576 ASTERN(O_CH, THERETHERE());
1577 copy = dupl(p, start+1, finish+1);
1578 assert(copy == finish+4);
1579 repeat(p, copy, 1, to-1);
1580 break;
1581 case REP(1, INF): /* as x+ */
1582 INSERT(OPLUS_, start);
1583 ASTERN(O_PLUS, start);
1584 break;
1585 case REP(N, N): /* as xx{m-1,n-1} */
1586 copy = dupl(p, start, finish);
1587 repeat(p, copy, from-1, to-1);
1588 break;
1589 case REP(N, INF): /* as xx{n-1,INF} */
1590 copy = dupl(p, start, finish);
1591 repeat(p, copy, from-1, to);
1592 break;
1593 default: /* "can't happen" */
1594 SETERROR(REG_ASSERT); /* just in case */
1595 break;
1596 }
1597 }
1598
1599 /*
1600 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1601 - character from the parse struct, signals a REG_ILLSEQ error if the
1602 - character can't be converted. Returns the number of bytes consumed.
1603 */
1604 static wint_t
1605 wgetnext(struct parse *p)
1606 {
1607 #ifdef NLS
1608 mbstate_t mbs;
1609 wchar_t wc;
1610 size_t n;
1611
1612 memset(&mbs, 0, sizeof(mbs));
1613 n = mbrtowc(&wc, p->next, (size_t)(p->end - p->next), &mbs);
1614 if (n == (size_t)-1 || n == (size_t)-2) {
1615 SETERROR(REG_ILLSEQ);
1616 return (0);
1617 }
1618 if (n == 0)
1619 n = 1;
1620 p->next += n;
1621 return wc;
1622 #else
1623 return *p->next++;
1624 #endif
1625 }
1626
1627 /*
1628 - seterr - set an error condition
1629 == static int seterr(struct parse *p, int e);
1630 */
1631 static int /* useless but makes type checking happy */
1632 seterr(struct parse *p, int e)
1633 {
1634
1635 _DIAGASSERT(p != NULL);
1636
1637 if (p->error == 0) /* keep earliest error condition */
1638 p->error = e;
1639 p->next = nuls; /* try to bring things to a halt */
1640 p->end = nuls;
1641 return(0); /* make the return value well-defined */
1642 }
1643
1644 /*
1645 - allocset - allocate a set of characters for []
1646 == static cset *allocset(struct parse *p);
1647 */
1648 static cset *
1649 allocset(struct parse *p)
1650 {
1651 cset *cs, *ncs;
1652
1653 _DIAGASSERT(p != NULL);
1654
1655 ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs));
1656 if (ncs == NULL) {
1657 SETERROR(REG_ESPACE);
1658 return (NULL);
1659 }
1660 p->g->sets = ncs;
1661 cs = &p->g->sets[p->g->ncsets++];
1662 memset(cs, 0, sizeof(*cs));
1663
1664 return(cs);
1665 }
1666
1667 /*
1668 - freeset - free a now-unused set
1669 == static void freeset(struct parse *p, cset *cs);
1670 */
1671 static void
1672 freeset(struct parse *p, cset *cs)
1673 {
1674 cset *top;
1675
1676 _DIAGASSERT(p != NULL);
1677 _DIAGASSERT(cs != NULL);
1678
1679 top = &p->g->sets[p->g->ncsets];
1680
1681 free(cs->wides);
1682 free(cs->ranges);
1683 free(cs->types);
1684 memset(cs, 0, sizeof(*cs));
1685 if (cs == top-1) /* recover only the easy case */
1686 p->g->ncsets--;
1687 }
1688
1689 /*
1690 - singleton - Determine whether a set contains only one character,
1691 - returning it if so, otherwise returning OUT.
1692 */
1693 static wint_t
1694 singleton(cset *cs)
1695 {
1696 wint_t i, s, n;
1697
1698 for (i = n = 0; i < NC; i++)
1699 if (CHIN(cs, i)) {
1700 n++;
1701 s = i;
1702 }
1703 if (n == 1)
1704 return (s);
1705 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1706 cs->icase == 0)
1707 return (cs->wides[0]);
1708 /* Don't bother handling the other cases. */
1709 return (OUT);
1710 }
1711
1712 /*
1713 - CHadd - add character to character set.
1714 */
1715 static void
1716 CHadd(struct parse *p, cset *cs, wint_t ch)
1717 {
1718 wint_t nch, *newwides;
1719
1720 _DIAGASSERT(p != NULL);
1721 _DIAGASSERT(cs != NULL);
1722
1723 assert(ch >= 0);
1724 if (ch < NC)
1725 cs->bmp[(unsigned)ch >> 3] |= 1 << (ch & 7);
1726 else {
1727 newwides = reallocarray(cs->wides, cs->nwides + 1,
1728 sizeof(*cs->wides));
1729 if (newwides == NULL) {
1730 SETERROR(REG_ESPACE);
1731 return;
1732 }
1733 cs->wides = newwides;
1734 cs->wides[cs->nwides++] = ch;
1735 }
1736 if (cs->icase) {
1737 if ((nch = towlower(ch)) < NC)
1738 cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1739 if ((nch = towupper(ch)) < NC)
1740 cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1741 }
1742 }
1743
1744 /*
1745 - CHaddrange - add all characters in the range [min,max] to a character set.
1746 */
1747 static void
1748 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1749 {
1750 crange *newranges;
1751
1752 _DIAGASSERT(p != NULL);
1753 _DIAGASSERT(cs != NULL);
1754
1755 for (; min < NC && min <= max; min++)
1756 CHadd(p, cs, min);
1757 if (min >= max)
1758 return;
1759 newranges = reallocarray(cs->ranges, cs->nranges + 1,
1760 sizeof(*cs->ranges));
1761 if (newranges == NULL) {
1762 SETERROR(REG_ESPACE);
1763 return;
1764 }
1765 cs->ranges = newranges;
1766 cs->ranges[cs->nranges].min = min;
1767 cs->ranges[cs->nranges].max = max;
1768 cs->nranges++;
1769 }
1770
1771 /*
1772 - CHaddtype - add all characters of a certain type to a character set.
1773 */
1774 static void
1775 CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1776 {
1777 wint_t i;
1778 wctype_t *newtypes;
1779
1780 _DIAGASSERT(p != NULL);
1781 _DIAGASSERT(cs != NULL);
1782
1783 for (i = 0; i < NC; i++)
1784 if (iswctype(i, wct))
1785 CHadd(p, cs, i);
1786 newtypes = reallocarray(cs->types, cs->ntypes + 1,
1787 sizeof(*cs->types));
1788 if (newtypes == NULL) {
1789 SETERROR(REG_ESPACE);
1790 return;
1791 }
1792 cs->types = newtypes;
1793 cs->types[cs->ntypes++] = wct;
1794 }
1795
1796 /*
1797 - dupl - emit a duplicate of a bunch of sops
1798 == static sopno dupl(struct parse *p, sopno start, sopno finish);
1799 */
1800 static sopno /* start of duplicate */
1801 dupl(struct parse *p,
1802 sopno start, /* from here */
1803 sopno finish) /* to this less one */
1804 {
1805 sopno ret = HERE();
1806 sopno len = finish - start;
1807
1808 _DIAGASSERT(p != NULL);
1809
1810 assert(finish >= start);
1811 if (len == 0)
1812 return(ret);
1813 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1814 return(ret);
1815 (void) memcpy(p->strip + p->slen,
1816 p->strip + start, len * sizeof(*p->strip));
1817 p->slen += len;
1818 return(ret);
1819 }
1820
1821 /*
1822 - doemit - emit a strip operator
1823 == static void doemit(struct parse *p, sop op, size_t opnd);
1824 *
1825 * It might seem better to implement this as a macro with a function as
1826 * hard-case backup, but it's just too big and messy unless there are
1827 * some changes to the data structures. Maybe later.
1828 */
1829 static void
1830 doemit(struct parse *p, sop op, size_t opnd)
1831 {
1832 /* avoid making error situations worse */
1833 if (p->error != 0)
1834 return;
1835
1836 _DIAGASSERT(p != NULL);
1837
1838 /* deal with oversize operands ("can't happen", more or less) */
1839 assert(opnd < 1<<OPSHIFT);
1840
1841 /* deal with undersized strip */
1842 if (p->slen >= p->ssize)
1843 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */
1844 return;
1845
1846 /* finally, it's all reduced to the easy case */
1847 p->strip[p->slen++] = (sopno)SOP(op, opnd);
1848 }
1849
1850 /*
1851 - doinsert - insert a sop into the strip
1852 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1853 */
1854 static void
1855 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1856 {
1857 sopno sn;
1858 sop s;
1859 int i;
1860
1861 _DIAGASSERT(p != NULL);
1862
1863 /* avoid making error situations worse */
1864 if (p->error != 0)
1865 return;
1866
1867 sn = HERE();
1868 EMIT(op, opnd); /* do checks, ensure space */
1869 assert(HERE() == sn+1);
1870 s = p->strip[sn];
1871
1872 /* adjust paren pointers */
1873 assert(pos > 0);
1874 for (i = 1; i < NPAREN; i++) {
1875 if (p->pbegin[i] >= pos) {
1876 p->pbegin[i]++;
1877 }
1878 if (p->pend[i] >= pos) {
1879 p->pend[i]++;
1880 }
1881 }
1882
1883 memmove(&p->strip[pos+1], &p->strip[pos],
1884 (HERE()-pos-1)*sizeof(*p->strip));
1885 p->strip[pos] = s;
1886 }
1887
1888 /*
1889 - dofwd - complete a forward reference
1890 == static void dofwd(struct parse *p, sopno pos, sop value);
1891 */
1892 static void
1893 dofwd(struct parse *p, sopno pos, sop value)
1894 {
1895
1896 _DIAGASSERT(p != NULL);
1897
1898 /* avoid making error situations worse */
1899 if (p->error != 0)
1900 return;
1901
1902 assert(value < 1<<OPSHIFT);
1903 p->strip[pos] = OP(p->strip[pos]) | value;
1904 }
1905
1906 /*
1907 - enlarge - enlarge the strip
1908 == static int enlarge(struct parse *p, sopno size);
1909 */
1910 static int
1911 enlarge(struct parse *p, sopno size)
1912 {
1913 sop *sp;
1914
1915 _DIAGASSERT(p != NULL);
1916
1917 if (p->ssize >= size)
1918 return 1;
1919
1920 sp = reallocarray(p->strip, size, sizeof(*p->strip));
1921 if (sp == NULL) {
1922 SETERROR(REG_ESPACE);
1923 return 0;
1924 }
1925 p->strip = sp;
1926 p->ssize = size;
1927 return 1;
1928 }
1929
1930 /*
1931 - stripsnug - compact the strip
1932 == static void stripsnug(struct parse *p, struct re_guts *g);
1933 */
1934 static void
1935 stripsnug(struct parse *p, struct re_guts *g)
1936 {
1937
1938 _DIAGASSERT(p != NULL);
1939 _DIAGASSERT(g != NULL);
1940
1941 g->nstates = p->slen;
1942 g->strip = reallocarray(p->strip, p->slen, sizeof(*p->strip));
1943 if (g->strip == NULL) {
1944 SETERROR(REG_ESPACE);
1945 g->strip = p->strip;
1946 }
1947 }
1948
1949 /*
1950 - findmust - fill in must and mlen with longest mandatory literal string
1951 == static void findmust(struct parse *p, struct re_guts *g);
1952 *
1953 * This algorithm could do fancy things like analyzing the operands of |
1954 * for common subsequences. Someday. This code is simple and finds most
1955 * of the interesting cases.
1956 *
1957 * Note that must and mlen got initialized during setup.
1958 */
1959 static void
1960 findmust(struct parse *p, struct re_guts *g)
1961 {
1962 sop *scan;
1963 sop *start = NULL;
1964 sop *newstart = NULL;
1965 sopno newlen;
1966 sop s;
1967 char *cp;
1968 int offset;
1969 mbstate_t mbs;
1970
1971 _DIAGASSERT(p != NULL);
1972 _DIAGASSERT(g != NULL);
1973
1974 /* avoid making error situations worse */
1975 if (p->error != 0)
1976 return;
1977
1978 #ifdef notyet
1979 /*
1980 * It's not generally safe to do a ``char'' substring search on
1981 * multibyte character strings, but it's safe for at least
1982 * UTF-8 (see RFC 3629).
1983 */
1984 if (MB_CUR_MAX > 1 &&
1985 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1986 return;
1987 #endif
1988
1989 /* find the longest OCHAR sequence in strip */
1990 newlen = 0;
1991 offset = 0;
1992 g->moffset = 0;
1993 scan = g->strip + 1;
1994 do {
1995 s = *scan++;
1996 switch (OP(s)) {
1997 case OCHAR: /* sequence member */
1998 if (newlen == 0) { /* new sequence */
1999 memset(&mbs, 0, sizeof(mbs));
2000 newstart = scan - 1;
2001 }
2002 #ifdef NLS
2003 char buf[MB_LEN_MAX];
2004 size_t clen = wcrtomb(buf, (int)OPND(s), &mbs);
2005 if (clen == (size_t)-1)
2006 goto toohard;
2007 newlen += (sopno)clen;
2008 #else
2009 newlen++;
2010 #endif
2011 break;
2012 case OPLUS_: /* things that don't break one */
2013 case OLPAREN:
2014 case ORPAREN:
2015 break;
2016 case OQUEST_: /* things that must be skipped */
2017 case OCH_:
2018 offset = altoffset(scan, offset);
2019 scan--;
2020 do {
2021 scan += OPND(s);
2022 s = *scan;
2023 /* assert() interferes w debug printouts */
2024 if (OP(s) != O_QUEST &&
2025 OP(s) != O_CH && OP(s) != OOR2) {
2026 g->iflags |= BAD;
2027 return;
2028 }
2029 } while (OP(s) != O_QUEST && OP(s) != O_CH);
2030 /* FALLTHROUGH */
2031 case OBOW: /* things that break a sequence */
2032 case OEOW:
2033 case OBOL:
2034 case OEOL:
2035 case OBOS:
2036 case OEOS:
2037 case OWBND:
2038 case ONWBND:
2039 case O_QUEST:
2040 case O_CH:
2041 case OEND:
2042 if (newlen > (sopno)g->mlen) { /* ends one */
2043 start = newstart;
2044 g->mlen = newlen;
2045 if (offset > -1) {
2046 g->moffset += offset;
2047 offset = newlen;
2048 } else
2049 g->moffset = offset;
2050 } else {
2051 if (offset > -1)
2052 offset += newlen;
2053 }
2054 newlen = 0;
2055 break;
2056 case OANY:
2057 if (newlen > (sopno)g->mlen) { /* ends one */
2058 start = newstart;
2059 g->mlen = newlen;
2060 if (offset > -1) {
2061 g->moffset += offset;
2062 offset = newlen;
2063 } else
2064 g->moffset = offset;
2065 } else {
2066 if (offset > -1)
2067 offset += newlen;
2068 }
2069 if (offset > -1)
2070 offset++;
2071 newlen = 0;
2072 break;
2073 case OANYOF: /* may or may not invalidate offset */
2074 /* First, everything as OANY */
2075 if (newlen > (sopno)g->mlen) { /* ends one */
2076 start = newstart;
2077 g->mlen = newlen;
2078 if (offset > -1) {
2079 g->moffset += offset;
2080 offset = newlen;
2081 } else
2082 g->moffset = offset;
2083 } else {
2084 if (offset > -1)
2085 offset += newlen;
2086 }
2087 if (offset > -1)
2088 offset++;
2089 newlen = 0;
2090 break;
2091 #ifdef NLS
2092 toohard:/*FALLTHROUGH*/
2093 #endif
2094 default:
2095 /* Anything here makes it impossible or too hard
2096 * to calculate the offset -- so we give up;
2097 * save the last known good offset, in case the
2098 * must sequence doesn't occur later.
2099 */
2100 if (newlen > (sopno)g->mlen) { /* ends one */
2101 start = newstart;
2102 g->mlen = newlen;
2103 if (offset > -1)
2104 g->moffset += offset;
2105 else
2106 g->moffset = offset;
2107 }
2108 offset = -1;
2109 newlen = 0;
2110 break;
2111 }
2112 } while (OP(s) != OEND);
2113
2114 if (g->mlen == 0) { /* there isn't one */
2115 g->moffset = -1;
2116 return;
2117 }
2118
2119 /* turn it into a character string */
2120 g->must = malloc((size_t)g->mlen + 1);
2121 if (g->must == NULL) { /* argh; just forget it */
2122 g->mlen = 0;
2123 g->moffset = -1;
2124 return;
2125 }
2126 cp = g->must;
2127 scan = start;
2128 memset(&mbs, 0, sizeof(mbs));
2129 while (cp < g->must + g->mlen) {
2130 while (OP(s = *scan++) != OCHAR)
2131 continue;
2132 #ifdef NLS
2133 size_t clen = wcrtomb(cp, (int)OPND(s), &mbs);
2134 assert(clen != (size_t)-1);
2135 cp += clen;
2136 #else
2137 *cp++ = OPND(s);
2138 #endif
2139 }
2140 assert(cp == g->must + g->mlen);
2141 *cp++ = '\0'; /* just on general principles */
2142 }
2143
2144 /*
2145 - altoffset - choose biggest offset among multiple choices
2146 == static int altoffset(sop *scan, int offset);
2147 *
2148 * Compute, recursively if necessary, the largest offset among multiple
2149 * re paths.
2150 */
2151 static int
2152 altoffset(sop *scan, int offset)
2153 {
2154 int largest;
2155 int try;
2156 sop s;
2157
2158 _DIAGASSERT(scan != NULL);
2159
2160 /* If we gave up already on offsets, return */
2161 if (offset == -1)
2162 return -1;
2163
2164 largest = 0;
2165 try = 0;
2166 s = *scan++;
2167 while (OP(s) != O_QUEST && OP(s) != O_CH) {
2168 switch (OP(s)) {
2169 case OOR1:
2170 if (try > largest)
2171 largest = try;
2172 try = 0;
2173 break;
2174 case OQUEST_:
2175 case OCH_:
2176 try = altoffset(scan, try);
2177 if (try == -1)
2178 return -1;
2179 scan--;
2180 do {
2181 scan += OPND(s);
2182 s = *scan;
2183 if (OP(s) != O_QUEST &&
2184 OP(s) != O_CH && OP(s) != OOR2)
2185 return -1;
2186 } while (OP(s) != O_QUEST && OP(s) != O_CH);
2187 /* We must skip to the next position, or we'll
2188 * leave altoffset() too early.
2189 */
2190 scan++;
2191 break;
2192 case OANYOF:
2193 case OCHAR:
2194 case OANY:
2195 try++;
2196 /*FALLTHROUGH*/
2197 case OBOW:
2198 case OEOW:
2199 case OWBND:
2200 case ONWBND:
2201 case OLPAREN:
2202 case ORPAREN:
2203 case OOR2:
2204 break;
2205 default:
2206 try = -1;
2207 break;
2208 }
2209 if (try == -1)
2210 return -1;
2211 s = *scan++;
2212 }
2213
2214 if (try > largest)
2215 largest = try;
2216
2217 return largest+offset;
2218 }
2219
2220 /*
2221 - computejumps - compute char jumps for BM scan
2222 == static void computejumps(struct parse *p, struct re_guts *g);
2223 *
2224 * This algorithm assumes g->must exists and is has size greater than
2225 * zero. It's based on the algorithm found on Computer Algorithms by
2226 * Sara Baase.
2227 *
2228 * A char jump is the number of characters one needs to jump based on
2229 * the value of the character from the text that was mismatched.
2230 */
2231 static void
2232 computejumps(struct parse *p, struct re_guts *g)
2233 {
2234 int ch;
2235 size_t mindex;
2236
2237 _DIAGASSERT(p != NULL);
2238 _DIAGASSERT(g != NULL);
2239
2240 /* Avoid making errors worse */
2241 if (p->error != 0)
2242 return;
2243
2244 g->charjump = calloc((NC_MAX + 1), sizeof(*g->charjump));
2245 if (g->charjump == NULL) /* Not a fatal error */
2246 return;
2247 /* Adjust for signed chars, if necessary */
2248 g->charjump = &g->charjump[-(CHAR_MIN)];
2249
2250 /* If the character does not exist in the pattern, the jump
2251 * is equal to the number of characters in the pattern.
2252 */
2253 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
2254 g->charjump[ch] = g->mlen;
2255
2256 /* If the character does exist, compute the jump that would
2257 * take us to the last character in the pattern equal to it
2258 * (notice that we match right to left, so that last character
2259 * is the first one that would be matched).
2260 */
2261 for (mindex = 0; mindex < g->mlen; mindex++)
2262 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
2263 }
2264
2265 /*
2266 - computematchjumps - compute match jumps for BM scan
2267 == static void computematchjumps(struct parse *p, struct re_guts *g);
2268 *
2269 * This algorithm assumes g->must exists and is has size greater than
2270 * zero. It's based on the algorithm found on Computer Algorithms by
2271 * Sara Baase.
2272 *
2273 * A match jump is the number of characters one needs to advance based
2274 * on the already-matched suffix.
2275 * Notice that all values here are minus (g->mlen-1), because of the way
2276 * the search algorithm works.
2277 */
2278 static void
2279 computematchjumps(struct parse *p, struct re_guts *g)
2280 {
2281 size_t mindex; /* General "must" iterator */
2282 size_t suffix; /* Keeps track of matching suffix */
2283 size_t ssuffix; /* Keeps track of suffixes' suffix */
2284 size_t* pmatches; /* pmatches[k] points to the next i
2285 * such that i+1...mlen is a substring
2286 * of k+1...k+mlen-i-1
2287 */
2288
2289 _DIAGASSERT(p != NULL);
2290 _DIAGASSERT(g != NULL);
2291
2292 /* Avoid making errors worse */
2293 if (p->error != 0)
2294 return;
2295
2296 pmatches = calloc(g->mlen, sizeof(*pmatches));
2297 if (pmatches == NULL) {
2298 g->matchjump = NULL;
2299 return;
2300 }
2301
2302 g->matchjump = calloc(g->mlen, sizeof(*g->matchjump));
2303 if (g->matchjump == NULL) { /* Not a fatal error */
2304 free(pmatches);
2305 return;
2306 }
2307
2308 /* Set maximum possible jump for each character in the pattern */
2309 for (mindex = 0; mindex < g->mlen; mindex++)
2310 g->matchjump[mindex] = 2 * g->mlen - mindex - 1;
2311
2312 /* Compute pmatches[] */
2313 for (suffix = mindex = g->mlen; mindex-- > 0; suffix--) {
2314 pmatches[mindex] = suffix;
2315
2316 /* If a mismatch is found, interrupting the substring,
2317 * compute the matchjump for that position. If no
2318 * mismatch is found, then a text substring mismatched
2319 * against the suffix will also mismatch against the
2320 * substring.
2321 */
2322 while (suffix < g->mlen
2323 && g->must[mindex] != g->must[suffix]) {
2324 g->matchjump[suffix] = MIN(g->matchjump[suffix],
2325 g->mlen - mindex - 1);
2326 suffix = pmatches[suffix];
2327 }
2328 }
2329
2330 /* Compute the matchjump up to the last substring found to jump
2331 * to the beginning of the largest must pattern prefix matching
2332 * it's own suffix.
2333 */
2334 for (mindex = 0; mindex <= suffix; mindex++)
2335 g->matchjump[mindex] = MIN(g->matchjump[mindex],
2336 g->mlen + suffix - mindex);
2337
2338 ssuffix = pmatches[suffix];
2339 while (suffix < g->mlen) {
2340 while (suffix <= ssuffix && suffix < g->mlen) {
2341 g->matchjump[suffix] = MIN(g->matchjump[suffix],
2342 g->mlen + ssuffix - suffix);
2343 suffix++;
2344 }
2345 if (suffix < g->mlen)
2346 ssuffix = pmatches[ssuffix];
2347 }
2348
2349 free(pmatches);
2350 }
2351
2352 /*
2353 - pluscount - count + nesting
2354 == static sopno pluscount(struct parse *p, struct re_guts *g);
2355 */
2356 static sopno /* nesting depth */
2357 pluscount(struct parse *p, struct re_guts *g)
2358 {
2359 sop *scan;
2360 sop s;
2361 sopno plusnest = 0;
2362 sopno maxnest = 0;
2363
2364 _DIAGASSERT(p != NULL);
2365 _DIAGASSERT(g != NULL);
2366
2367 if (p->error != 0)
2368 return(0); /* there may not be an OEND */
2369
2370 scan = g->strip + 1;
2371 do {
2372 s = *scan++;
2373 switch (OP(s)) {
2374 case OPLUS_:
2375 plusnest++;
2376 break;
2377 case O_PLUS:
2378 if (plusnest > maxnest)
2379 maxnest = plusnest;
2380 plusnest--;
2381 break;
2382 }
2383 } while (OP(s) != OEND);
2384 if (plusnest != 0)
2385 g->iflags |= BAD;
2386 return(maxnest);
2387 }
2388