regcomp.c revision 1.40 1 /* $NetBSD: regcomp.c,v 1.40 2021/02/24 18:13:21 christos Exp $ */
2
3 /*-
4 * SPDX-License-Identifier: BSD-3-Clause
5 *
6 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
7 * Copyright (c) 1992, 1993, 1994
8 * The Regents of the University of California. All rights reserved.
9 *
10 * Copyright (c) 2011 The FreeBSD Foundation
11 * All rights reserved.
12 * Portions of this software were developed by David Chisnall
13 * under sponsorship from the FreeBSD Foundation.
14 *
15 * This code is derived from software contributed to Berkeley by
16 * Henry Spencer.
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 * 1. Redistributions of source code must retain the above copyright
22 * notice, this list of conditions and the following disclaimer.
23 * 2. Redistributions in binary form must reproduce the above copyright
24 * notice, this list of conditions and the following disclaimer in the
25 * documentation and/or other materials provided with the distribution.
26 * 3. Neither the name of the University nor the names of its contributors
27 * may be used to endorse or promote products derived from this software
28 * without specific prior written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40 * SUCH DAMAGE.
41 *
42 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
43 */
44
45 #include <sys/cdefs.h>
46 #if 0
47 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
48 __FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 368359 2020-12-05 03:18:48Z kevans $");
49 #endif
50 __RCSID("$NetBSD: regcomp.c,v 1.40 2021/02/24 18:13:21 christos Exp $");
51
52 #define _OPENBSD_SOURCE
53 #define REGEX_GNU_EXTENSIONS
54
55 #include "namespace.h"
56 #include <sys/types.h>
57 #include <stdio.h>
58 #include <string.h>
59 #include <ctype.h>
60 #include <limits.h>
61 #include <stdlib.h>
62 #include <regex.h>
63 #include <stdbool.h>
64 #include <wchar.h>
65 #include <wctype.h>
66
67 #ifdef __weak_alias
68 __weak_alias(regcomp,_regcomp)
69 #endif
70
71 #ifdef REGEX_LIBC_COLLATE
72 #include "collate.h"
73 #endif
74
75 #include "utils.h"
76 #include "regex2.h"
77
78 #include "cname.h"
79
80 /*
81 * Branching context, used to keep track of branch state for all of the branch-
82 * aware functions. In addition to keeping track of branch positions for the
83 * p_branch_* functions, we use this to simplify some clumsiness in BREs for
84 * detection of whether ^ is acting as an anchor or being used erroneously and
85 * also for whether we're in a sub-expression or not.
86 */
87 struct branchc {
88 sopno start;
89 sopno back;
90 sopno fwd;
91
92 int nbranch;
93 int nchain;
94 bool outer;
95 bool terminate;
96 };
97
98 /*
99 * parse structure, passed up and down to avoid global variables and
100 * other clumsinesses
101 */
102 struct parse {
103 const char *next; /* next character in RE */
104 const char *end; /* end of string (-> NUL normally) */
105 int error; /* has an error been seen? */
106 int gnuext;
107 sop *strip; /* malloced strip */
108 sopno ssize; /* malloced strip size (allocated) */
109 sopno slen; /* malloced strip length (used) */
110 size_t ncsalloc; /* number of csets allocated */
111 struct re_guts *g;
112 # define NPAREN 10 /* we need to remember () 1-9 for back refs */
113 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
114 sopno pend[NPAREN]; /* -> ) ([0] unused) */
115 bool allowbranch; /* can this expression branch? */
116 bool bre; /* convenience; is this a BRE? */
117 int pflags; /* other parsing flags -- legacy escapes? */
118 bool (*parse_expr)(struct parse *, struct branchc *);
119 void (*pre_parse)(struct parse *, struct branchc *);
120 void (*post_parse)(struct parse *, struct branchc *);
121 };
122
123 #define PFLAG_LEGACY_ESC 0x00000001
124
125 /* ========= begin header generated by ./mkh ========= */
126 #ifdef __cplusplus
127 extern "C" {
128 #endif
129
130 /* === regcomp.c === */
131 static bool p_ere_exp(struct parse *p, struct branchc *bc);
132 static void p_str(struct parse *p);
133 static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
134 static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
135 static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
136 static bool p_branch_empty(struct parse *p, struct branchc *bc);
137 static bool p_branch_do(struct parse *p, struct branchc *bc);
138 static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
139 static void p_bre_post_parse(struct parse *p, struct branchc *bc);
140 static void p_re(struct parse *p, int end1, int end2);
141 static bool p_simp_re(struct parse *p, struct branchc *bc);
142 static int p_count(struct parse *p);
143 static void p_bracket(struct parse *p);
144 static int p_range_cmp(wchar_t c1, wchar_t c2);
145 static void p_b_term(struct parse *p, cset *cs);
146 #ifdef REGEX_GNU_EXTENSIONS
147 static int p_b_pseudoclass(struct parse *p, char c);
148 #endif
149 static void p_b_cclass(struct parse *p, cset *cs);
150 static void p_b_cclass_named(struct parse *p, cset *cs, const char[]);
151 static void p_b_eclass(struct parse *p, cset *cs);
152 static wint_t p_b_symbol(struct parse *p);
153 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
154 static bool may_escape(struct parse *p, const wint_t ch);
155 static wint_t othercase(wint_t ch);
156 static void bothcases(struct parse *p, wint_t ch);
157 static void ordinary(struct parse *p, wint_t ch);
158 static void nonnewline(struct parse *p);
159 static void repeat(struct parse *p, sopno start, int from, int to);
160 static int seterr(struct parse *p, int e);
161 static cset *allocset(struct parse *p);
162 static void freeset(struct parse *p, cset *cs);
163 static void CHadd(struct parse *p, cset *cs, wint_t ch);
164 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
165 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
166 static wint_t singleton(cset *cs);
167 static sopno dupl(struct parse *p, sopno start, sopno finish);
168 static void doemit(struct parse *p, sop op, size_t opnd);
169 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
170 static void dofwd(struct parse *p, sopno pos, sop value);
171 static int enlarge(struct parse *p, sopno size);
172 static void stripsnug(struct parse *p, struct re_guts *g);
173 static void findmust(struct parse *p, struct re_guts *g);
174 static int altoffset(sop *scan, int offset);
175 static void computejumps(struct parse *p, struct re_guts *g);
176 static void computematchjumps(struct parse *p, struct re_guts *g);
177 static sopno pluscount(struct parse *p, struct re_guts *g);
178 static wint_t wgetnext(struct parse *p);
179
180 #ifdef __cplusplus
181 }
182 #endif
183 /* ========= end header generated by ./mkh ========= */
184
185 static char nuls[10]; /* place to point scanner in event of error */
186
187 /*
188 * macros for use with parse structure
189 * BEWARE: these know that the parse structure is named `p' !!!
190 */
191 #define PEEK() (*p->next)
192 #define PEEK2() (*(p->next+1))
193 #define MORE() (p->next < p->end)
194 #define MORE2() (p->next+1 < p->end)
195 #define SEE(c) (MORE() && PEEK() == (c))
196 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
197 #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a))
198 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
199 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
200 #define EATSPEC(a) (p->bre ? EATTWO('\\', a) : EAT(a))
201 #define NEXT() (p->next++)
202 #define NEXT2() (p->next += 2)
203 #define NEXTn(n) (p->next += (n))
204 #define GETNEXT() (*p->next++)
205 #define WGETNEXT() wgetnext(p)
206 #define SETERROR(e) seterr(p, (e))
207 #define REQUIRE(co, e) ((co) || SETERROR(e))
208 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
209 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
210 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
211 #define EMIT(op, sopnd) doemit(p, (op), (sopnd))
212 #define INSERT(op, pos) doinsert(p, (op), HERE()-(pos)+1, pos)
213 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
214 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
215 #define HERE() (p->slen)
216 #define THERE() (p->slen - 1)
217 #define THERETHERE() (p->slen - 2)
218 #define DROP(n) (p->slen -= (n))
219
220 /* Macro used by computejump()/computematchjump() */
221 #define MIN(a,b) ((a)<(b)?(a):(b))
222
223 static int /* 0 success, otherwise REG_something */
224 regcomp_internal(regex_t * __restrict preg,
225 const char * __restrict pattern,
226 int cflags, int pflags)
227 {
228 struct parse pa;
229 struct re_guts *g;
230 struct parse *p = &pa;
231 int i;
232 size_t len;
233 size_t maxlen;
234 #ifdef REDEBUG
235 # define GOODFLAGS(f) (f)
236 #else
237 # define GOODFLAGS(f) ((f)&~REG_DUMP)
238 #endif
239
240 _DIAGASSERT(preg != NULL);
241 _DIAGASSERT(pattern != NULL);
242
243 cflags = GOODFLAGS(cflags);
244 if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
245 return(REG_INVARG);
246
247 if (cflags®_PEND) {
248 if (preg->re_endp < pattern)
249 return(REG_INVARG);
250 len = preg->re_endp - pattern;
251 } else
252 len = strlen(pattern);
253
254 /* do the mallocs early so failure handling is easy */
255 g = malloc(sizeof(*g));
256 if (g == NULL)
257 return(REG_ESPACE);
258 /*
259 * Limit the pattern space to avoid a 32-bit overflow on buffer
260 * extension. Also avoid any signed overflow in case of conversion
261 * so make the real limit based on a 31-bit overflow.
262 *
263 * Likely not applicable on 64-bit systems but handle the case
264 * generically (who are we to stop people from using ~715MB+
265 * patterns?).
266 */
267 maxlen = ((size_t)-1 >> 1) / sizeof(*p->strip) * 2 / 3;
268 if (len >= maxlen) {
269 free(g);
270 return(REG_ESPACE);
271 }
272 p->ssize = (sopno)(len / 2 * 3 + 1); /* ugh */
273 assert(p->ssize >= len);
274
275 p->strip = calloc(p->ssize, sizeof(*p->strip));
276 p->slen = 0;
277 if (p->strip == NULL) {
278 free(g);
279 return(REG_ESPACE);
280 }
281
282 /* set things up */
283 p->g = g;
284 p->next = pattern; /* convenience; we do not modify it */
285 p->end = p->next + len;
286 p->error = 0;
287 p->ncsalloc = 0;
288 p->pflags = pflags;
289 for (i = 0; i < NPAREN; i++) {
290 p->pbegin[i] = 0;
291 p->pend[i] = 0;
292 }
293 #ifdef REGEX_GNU_EXTENSIONS
294 if ((cflags & REG_GNU) == 0) {
295 p->gnuext = false;
296 p->allowbranch = (cflags & REG_EXTENDED) != 0;
297 } else
298 p->gnuext = p->allowbranch = true;
299 #else
300 p->gnuext = false;
301 p->allowbranch = (cflags & REG_EXTENDED) != 0;
302 #endif
303 if (cflags & REG_EXTENDED) {
304 p->bre = false;
305 p->parse_expr = p_ere_exp;
306 p->pre_parse = NULL;
307 p->post_parse = NULL;
308 } else {
309 p->bre = true;
310 p->parse_expr = p_simp_re;
311 p->pre_parse = p_bre_pre_parse;
312 p->post_parse = p_bre_post_parse;
313 }
314 g->sets = NULL;
315 g->ncsets = 0;
316 g->cflags = cflags;
317 g->iflags = 0;
318 g->nbol = 0;
319 g->neol = 0;
320 g->must = NULL;
321 g->moffset = -1;
322 g->charjump = NULL;
323 g->matchjump = NULL;
324 g->mlen = 0;
325 g->nsub = 0;
326 g->backrefs = 0;
327
328 /* do it */
329 EMIT(OEND, 0);
330 g->firststate = THERE();
331 if (cflags & REG_NOSPEC)
332 p_str(p);
333 else
334 p_re(p, OUT, OUT);
335 EMIT(OEND, 0);
336 g->laststate = THERE();
337
338 /* tidy up loose ends and fill things in */
339 stripsnug(p, g);
340 findmust(p, g);
341 /* only use Boyer-Moore algorithm if the pattern is bigger
342 * than three characters
343 */
344 if(g->mlen > 3) {
345 computejumps(p, g);
346 computematchjumps(p, g);
347 if(g->matchjump == NULL && g->charjump != NULL) {
348 free(g->charjump);
349 g->charjump = NULL;
350 }
351 }
352 g->nplus = pluscount(p, g);
353 g->magic = MAGIC2;
354 preg->re_nsub = g->nsub;
355 preg->re_g = g;
356 preg->re_magic = MAGIC1;
357 #ifndef REDEBUG
358 /* not debugging, so can't rely on the assert() in regexec() */
359 if (g->iflags&BAD)
360 SETERROR(REG_ASSERT);
361 #endif
362
363 /* win or lose, we're done */
364 if (p->error != 0) /* lose */
365 regfree(preg);
366 return(p->error);
367 }
368
369 /*
370 - regcomp - interface for parser and compilation
371 = extern int regcomp(regex_t *, const char *, int);
372 = #define REG_BASIC 0000
373 = #define REG_EXTENDED 0001
374 = #define REG_ICASE 0002
375 = #define REG_NOSUB 0004
376 = #define REG_NEWLINE 0010
377 = #define REG_NOSPEC 0020
378 = #define REG_PEND 0040
379 = #define REG_DUMP 0200
380 */
381 int /* 0 success, otherwise REG_something */
382 regcomp(regex_t * __restrict preg,
383 const char * __restrict pattern,
384 int cflags)
385 {
386
387 return (regcomp_internal(preg, pattern, cflags, 0));
388 }
389
390 /*
391 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
392 - return whether we should terminate or not
393 == static bool p_ere_exp(struct parse *p);
394 */
395 static bool
396 p_ere_exp(struct parse *p, struct branchc *bc)
397 {
398 char c;
399 wint_t wc;
400 sopno pos;
401 int count;
402 int count2;
403 #ifdef REGEX_GNU_EXTENSIONS
404 size_t i;
405 int handled;
406 #endif
407 sopno subno;
408 int wascaret = 0;
409
410 _DIAGASSERT(p != NULL);
411
412 (void)bc;
413 assert(MORE()); /* caller should have ensured this */
414 c = GETNEXT();
415
416 #ifdef REGEX_GNU_EXTENSIONS
417 handled = 0;
418 #endif
419 pos = HERE();
420 switch (c) {
421 case '(':
422 (void)REQUIRE(MORE(), REG_EPAREN);
423 p->g->nsub++;
424 subno = (sopno)p->g->nsub;
425 if (subno < NPAREN)
426 p->pbegin[subno] = HERE();
427 EMIT(OLPAREN, subno);
428 if (!SEE(')'))
429 p_re(p, ')', IGN);
430 if (subno < NPAREN) {
431 p->pend[subno] = HERE();
432 assert(p->pend[subno] != 0);
433 }
434 EMIT(ORPAREN, subno);
435 (void)MUSTEAT(')', REG_EPAREN);
436 break;
437 #ifndef POSIX_MISTAKE
438 case ')': /* happens only if no current unmatched ( */
439 /*
440 * You may ask, why the ifndef? Because I didn't notice
441 * this until slightly too late for 1003.2, and none of the
442 * other 1003.2 regular-expression reviewers noticed it at
443 * all. So an unmatched ) is legal POSIX, at least until
444 * we can get it fixed.
445 */
446 SETERROR(REG_EPAREN);
447 break;
448 #endif
449 case '^':
450 EMIT(OBOL, 0);
451 p->g->iflags |= USEBOL;
452 p->g->nbol++;
453 wascaret = 1;
454 break;
455 case '$':
456 EMIT(OEOL, 0);
457 p->g->iflags |= USEEOL;
458 p->g->neol++;
459 break;
460 case '|':
461 SETERROR(REG_EMPTY);
462 break;
463 case '*':
464 case '+':
465 case '?':
466 case '{':
467 SETERROR(REG_BADRPT);
468 break;
469 case '.':
470 if (p->g->cflags®_NEWLINE)
471 nonnewline(p);
472 else
473 EMIT(OANY, 0);
474 break;
475 case '[':
476 p_bracket(p);
477 break;
478 case '\\':
479 (void)REQUIRE(MORE(), REG_EESCAPE);
480 wc = WGETNEXT();
481 #ifdef REGEX_GNU_EXTENSIONS
482 if (p->gnuext) {
483 handled = 1;
484 switch (wc) {
485 case '`':
486 EMIT(OBOS, 0);
487 break;
488 case '\'':
489 EMIT(OEOS, 0);
490 break;
491 case 'B':
492 EMIT(ONWBND, 0);
493 break;
494 case 'b':
495 EMIT(OWBND, 0);
496 break;
497 case 'W':
498 case 'w':
499 case 'S':
500 case 's':
501 p_b_pseudoclass(p, wc);
502 break;
503 case '1':
504 case '2':
505 case '3':
506 case '4':
507 case '5':
508 case '6':
509 case '7':
510 case '8':
511 case '9':
512 i = wc - '0';
513 assert(i < NPAREN);
514 if (p->pend[i] != 0) {
515 assert(i <= p->g->nsub);
516 EMIT(OBACK_, i);
517 assert(p->pbegin[i] != 0);
518 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
519 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
520 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
521 EMIT(O_BACK, i);
522 } else
523 SETERROR(REG_ESUBREG);
524 p->g->backrefs = 1;
525 break;
526 default:
527 handled = 0;
528 }
529 /* Don't proceed to the POSIX bits if we've already handled it */
530 if (handled)
531 break;
532 }
533 #endif
534 switch (wc) {
535 case '<':
536 EMIT(OBOW, 0);
537 break;
538 case '>':
539 EMIT(OEOW, 0);
540 break;
541 default:
542 if (may_escape(p, wc))
543 ordinary(p, wc);
544 else
545 SETERROR(REG_EESCAPE);
546 break;
547 }
548 break;
549 default:
550 if (p->error != 0)
551 return (false);
552 p->next--;
553 wc = WGETNEXT();
554 ordinary(p, wc);
555 break;
556 }
557
558 if (!MORE())
559 return (false);
560 c = PEEK();
561 /* we call { a repetition if followed by a digit */
562 if (!( c == '*' || c == '+' || c == '?' || c == '{'))
563 return (false); /* no repetition, we're done */
564 else if (c == '{')
565 (void)REQUIRE(MORE2() && \
566 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
567 NEXT();
568
569 (void)REQUIRE(!wascaret, REG_BADRPT);
570 switch (c) {
571 case '*': /* implemented as +? */
572 /* this case does not require the (y|) trick, noKLUDGE */
573 INSERT(OPLUS_, pos);
574 ASTERN(O_PLUS, pos);
575 INSERT(OQUEST_, pos);
576 ASTERN(O_QUEST, pos);
577 break;
578 case '+':
579 INSERT(OPLUS_, pos);
580 ASTERN(O_PLUS, pos);
581 break;
582 case '?':
583 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
584 INSERT(OCH_, pos); /* offset slightly wrong */
585 ASTERN(OOR1, pos); /* this one's right */
586 AHEAD(pos); /* fix the OCH_ */
587 EMIT(OOR2, 0); /* offset very wrong... */
588 AHEAD(THERE()); /* ...so fix it */
589 ASTERN(O_CH, THERETHERE());
590 break;
591 case '{':
592 count = p_count(p);
593 if (EAT(',')) {
594 if (isdigit((uch)PEEK())) {
595 count2 = p_count(p);
596 (void)REQUIRE(count <= count2, REG_BADBR);
597 } else /* single number with comma */
598 count2 = INFINITY;
599 } else /* just a single number */
600 count2 = count;
601 repeat(p, pos, count, count2);
602 if (!EAT('}')) { /* error heuristics */
603 while (MORE() && PEEK() != '}')
604 NEXT();
605 (void)REQUIRE(MORE(), REG_EBRACE);
606 SETERROR(REG_BADBR);
607 }
608 break;
609 }
610
611 if (!MORE())
612 return (false);
613 c = PEEK();
614 if (!( c == '*' || c == '+' || c == '?' ||
615 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
616 return (false);
617 SETERROR(REG_BADRPT);
618 return (false);
619 }
620
621 /*
622 - p_str - string (no metacharacters) "parser"
623 == static void p_str(struct parse *p);
624 */
625 static void
626 p_str(struct parse *p)
627 {
628 (void)REQUIRE(MORE(), REG_EMPTY);
629 while (MORE())
630 ordinary(p, WGETNEXT());
631 }
632
633 /*
634 * Eat consecutive branch delimiters for the kind of expression that we are
635 * parsing, return the number of delimiters that we ate.
636 */
637 static int
638 p_branch_eat_delim(struct parse *p, struct branchc *bc)
639 {
640 int nskip;
641
642 (void)bc;
643 nskip = 0;
644 while (EATSPEC('|'))
645 ++nskip;
646 return (nskip);
647 }
648
649 /*
650 * Insert necessary branch book-keeping operations. This emits a
651 * bogus 'next' offset, since we still have more to parse
652 */
653 static void
654 p_branch_ins_offset(struct parse *p, struct branchc *bc)
655 {
656
657 if (bc->nbranch == 0) {
658 INSERT(OCH_, bc->start); /* offset is wrong */
659 bc->fwd = bc->start;
660 bc->back = bc->start;
661 }
662
663 ASTERN(OOR1, bc->back);
664 bc->back = THERE();
665 AHEAD(bc->fwd); /* fix previous offset */
666 bc->fwd = HERE();
667 EMIT(OOR2, 0); /* offset is very wrong */
668 ++bc->nbranch;
669 }
670
671 /*
672 * Fix the offset of the tail branch, if we actually had any branches.
673 * This is to correct the bogus placeholder offset that we use.
674 */
675 static void
676 p_branch_fix_tail(struct parse *p, struct branchc *bc)
677 {
678
679 /* Fix bogus offset at the tail if we actually have branches */
680 if (bc->nbranch > 0) {
681 AHEAD(bc->fwd);
682 ASTERN(O_CH, bc->back);
683 }
684 }
685
686 /*
687 * Signal to the parser that an empty branch has been encountered; this will,
688 * in the future, be used to allow for more permissive behavior with empty
689 * branches. The return value should indicate whether parsing may continue
690 * or not.
691 */
692 static bool
693 p_branch_empty(struct parse *p, struct branchc *bc)
694 {
695
696 (void)bc;
697 SETERROR(REG_EMPTY);
698 return (false);
699 }
700
701 /*
702 * Take care of any branching requirements. This includes inserting the
703 * appropriate branching instructions as well as eating all of the branch
704 * delimiters until we either run out of pattern or need to parse more pattern.
705 */
706 static bool
707 p_branch_do(struct parse *p, struct branchc *bc)
708 {
709 int ate = 0;
710
711 ate = p_branch_eat_delim(p, bc);
712 if (ate == 0)
713 return (false);
714 else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
715 /*
716 * Halt parsing only if we have an empty branch and p_branch_empty
717 * indicates that we must not continue. In the future, this will not
718 * necessarily be an error.
719 */
720 return (false);
721 p_branch_ins_offset(p, bc);
722
723 return (true);
724 }
725
726 static void
727 p_bre_pre_parse(struct parse *p, struct branchc *bc)
728 {
729
730 (void)bc;
731 /*
732 * Does not move cleanly into expression parser because of
733 * ordinary interpration of * at the beginning position of
734 * an expression.
735 */
736 if (EAT('^')) {
737 EMIT(OBOL, 0);
738 p->g->iflags |= USEBOL;
739 p->g->nbol++;
740 }
741 }
742
743 static void
744 p_bre_post_parse(struct parse *p, struct branchc *bc)
745 {
746
747 /* Expression is terminating due to EOL token */
748 if (bc->terminate) {
749 DROP(1);
750 EMIT(OEOL, 0);
751 p->g->iflags |= USEEOL;
752 p->g->neol++;
753 }
754 }
755
756 /*
757 - p_re - Top level parser, concatenation and BRE anchoring
758 == static void p_re(struct parse *p, int end1, int end2);
759 * Giving end1 as OUT essentially eliminates the end1/end2 check.
760 *
761 * This implementation is a bit of a kludge, in that a trailing $ is first
762 * taken as an ordinary character and then revised to be an anchor.
763 * The amount of lookahead needed to avoid this kludge is excessive.
764 */
765 static void
766 p_re(struct parse *p,
767 int end1, /* first terminating character */
768 int end2) /* second terminating character; ignored for EREs */
769 {
770 struct branchc bc;
771
772 bc.nbranch = 0;
773 if (end1 == OUT && end2 == OUT)
774 bc.outer = true;
775 else
776 bc.outer = false;
777 #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2))
778 for (;;) {
779 bc.start = HERE();
780 bc.nchain = 0;
781 bc.terminate = false;
782 if (p->pre_parse != NULL)
783 p->pre_parse(p, &bc);
784 while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
785 bc.terminate = p->parse_expr(p, &bc);
786 ++bc.nchain;
787 }
788 if (p->post_parse != NULL)
789 p->post_parse(p, &bc);
790 (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY);
791 #ifdef REGEX_GNU_EXTENSIONS
792 if (p->gnuext && HERE() == bc.start && !p_branch_empty(p, &bc))
793 break;
794 #endif
795 if (!p->allowbranch)
796 break;
797 /*
798 * p_branch_do's return value indicates whether we should
799 * continue parsing or not. This is both for correctness and
800 * a slight optimization, because it will check if we've
801 * encountered an empty branch or the end of the string
802 * immediately following a branch delimiter.
803 */
804 if (!p_branch_do(p, &bc))
805 break;
806 }
807 #undef SEE_END
808 if (p->allowbranch)
809 p_branch_fix_tail(p, &bc);
810 assert(!MORE() || SEE(end1));
811 }
812
813 /*
814 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
815 == static bool p_simp_re(struct parse *p, struct branchc *bc);
816 */
817 static bool /* was the simple RE an unbackslashed $? */
818 p_simp_re(struct parse *p, struct branchc *bc)
819 {
820 int c;
821 int cc; /* convenient/control character */
822 int count;
823 int count2;
824 sopno pos;
825 bool handled;
826 size_t i;
827 wint_t wc;
828 sopno subno;
829 # define BACKSL (1<<CHAR_BIT)
830
831 pos = HERE(); /* repetition op, if any, covers from here */
832 handled = false;
833
834 assert(MORE()); /* caller should have ensured this */
835 c = GETNEXT();
836 if (c == '\\') {
837 (void)REQUIRE(MORE(), REG_EESCAPE);
838 cc = GETNEXT();
839 c = BACKSL | cc;
840 #ifdef REGEX_GNU_EXTENSIONS
841 if (p->gnuext) {
842 handled = true;
843 switch (c) {
844 case BACKSL|'`':
845 EMIT(OBOS, 0);
846 break;
847 case BACKSL|'\'':
848 EMIT(OEOS, 0);
849 break;
850 case BACKSL|'B':
851 EMIT(ONWBND, 0);
852 break;
853 case BACKSL|'b':
854 EMIT(OWBND, 0);
855 break;
856 case BACKSL|'W':
857 case BACKSL|'w':
858 case BACKSL|'S':
859 case BACKSL|'s':
860 p_b_pseudoclass(p, cc);
861 break;
862 default:
863 handled = false;
864 }
865 }
866 #endif
867 }
868 if (!handled) {
869 switch (c) {
870 case '.':
871 if (p->g->cflags®_NEWLINE)
872 nonnewline(p);
873 else
874 EMIT(OANY, 0);
875 break;
876 case '[':
877 p_bracket(p);
878 break;
879 case BACKSL|'<':
880 EMIT(OBOW, 0);
881 break;
882 case BACKSL|'>':
883 EMIT(OEOW, 0);
884 break;
885 case BACKSL|'{':
886 SETERROR(REG_BADRPT);
887 break;
888 case BACKSL|'(':
889 p->g->nsub++;
890 subno = (sopno)p->g->nsub;
891 if (subno < NPAREN)
892 p->pbegin[subno] = HERE();
893 EMIT(OLPAREN, subno);
894 /* the MORE here is an error heuristic */
895 if (MORE() && !SEETWO('\\', ')'))
896 p_re(p, '\\', ')');
897 if (subno < NPAREN) {
898 p->pend[subno] = HERE();
899 assert(p->pend[subno] != 0);
900 }
901 EMIT(ORPAREN, subno);
902 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
903 break;
904 case BACKSL|')': /* should not get here -- must be user */
905 SETERROR(REG_EPAREN);
906 break;
907 case BACKSL|'1':
908 case BACKSL|'2':
909 case BACKSL|'3':
910 case BACKSL|'4':
911 case BACKSL|'5':
912 case BACKSL|'6':
913 case BACKSL|'7':
914 case BACKSL|'8':
915 case BACKSL|'9':
916 i = (c&~BACKSL) - '0';
917 assert(i < NPAREN);
918 if (p->pend[i] != 0) {
919 assert(i <= p->g->nsub);
920 EMIT(OBACK_, i);
921 assert(p->pbegin[i] != 0);
922 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
923 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
924 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
925 EMIT(O_BACK, i);
926 } else
927 SETERROR(REG_ESUBREG);
928 p->g->backrefs = 1;
929 break;
930 case '*':
931 /*
932 * Ordinary if used as the first character beyond BOL anchor of
933 * a (sub-)expression, counts as a bad repetition operator if it
934 * appears otherwise.
935 */
936 (void)REQUIRE(bc->nchain == 0, REG_BADRPT);
937 /* FALLTHROUGH */
938 default:
939 if (p->error != 0)
940 return (false); /* Definitely not $... */
941 p->next--;
942 wc = WGETNEXT();
943 if ((c & BACKSL) == 0 || may_escape(p, wc))
944 ordinary(p, wc);
945 else
946 SETERROR(REG_EESCAPE);
947 break;
948 }
949 }
950
951 if (EAT('*')) { /* implemented as +? */
952 /* this case does not require the (y|) trick, noKLUDGE */
953 INSERT(OPLUS_, pos);
954 ASTERN(O_PLUS, pos);
955 INSERT(OQUEST_, pos);
956 ASTERN(O_QUEST, pos);
957 #ifdef REGEX_GNU_EXTENSIONS
958 } else if (p->gnuext && EATTWO('\\', '?')) {
959 INSERT(OQUEST_, pos);
960 ASTERN(O_QUEST, pos);
961 } else if (p->gnuext && EATTWO('\\', '+')) {
962 INSERT(OPLUS_, pos);
963 ASTERN(O_PLUS, pos);
964 #endif
965 } else if (EATTWO('\\', '{')) {
966 count = p_count(p);
967 if (EAT(',')) {
968 if (MORE() && isdigit((uch)PEEK())) {
969 count2 = p_count(p);
970 (void)REQUIRE(count <= count2, REG_BADBR);
971 } else /* single number with comma */
972 count2 = INFINITY;
973 } else /* just a single number */
974 count2 = count;
975 repeat(p, pos, count, count2);
976 if (!EATTWO('\\', '}')) { /* error heuristics */
977 while (MORE() && !SEETWO('\\', '}'))
978 NEXT();
979 (void)REQUIRE(MORE(), REG_EBRACE);
980 SETERROR(REG_BADBR);
981 }
982 } else if (c == '$') /* $ (but not \$) ends it */
983 return (true);
984
985 return (false);
986 }
987
988 /*
989 - p_count - parse a repetition count
990 == static int p_count(struct parse *p);
991 */
992 static int /* the value */
993 p_count(struct parse *p)
994 {
995 int count = 0;
996 int ndigits = 0;
997
998 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
999 count = count*10 + (GETNEXT() - '0');
1000 ndigits++;
1001 }
1002
1003 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
1004 return(count);
1005 }
1006
1007 /*
1008 - p_bracket - parse a bracketed character list
1009 == static void p_bracket(struct parse *p);
1010 */
1011 static void
1012 p_bracket(struct parse *p)
1013 {
1014 cset *cs;
1015 wint_t ch;
1016
1017 /* Dept of Truly Sickening Special-Case Kludges */
1018 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
1019 EMIT(OBOW, 0);
1020 NEXTn(6);
1021 return;
1022 }
1023 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
1024 EMIT(OEOW, 0);
1025 NEXTn(6);
1026 return;
1027 }
1028
1029 if ((cs = allocset(p)) == NULL)
1030 return;
1031
1032 if (p->g->cflags®_ICASE)
1033 cs->icase = 1;
1034 if (EAT('^'))
1035 cs->invert = 1;
1036 if (EAT(']'))
1037 CHadd(p, cs, ']');
1038 else if (EAT('-'))
1039 CHadd(p, cs, '-');
1040 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
1041 p_b_term(p, cs);
1042 if (EAT('-'))
1043 CHadd(p, cs, '-');
1044 (void)MUSTEAT(']', REG_EBRACK);
1045
1046 if (p->error != 0) /* don't mess things up further */
1047 return;
1048
1049 if (cs->invert && p->g->cflags®_NEWLINE)
1050 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
1051
1052 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
1053 ordinary(p, ch);
1054 freeset(p, cs);
1055 } else
1056 EMIT(OANYOF, (size_t)(cs - p->g->sets));
1057 }
1058
1059 static int
1060 p_range_cmp(wchar_t c1, wchar_t c2)
1061 {
1062 #ifdef REGEX_LIBC_COLLATE
1063 return __wcollate_range_cmp(c1, c2);
1064 #else
1065 /* Copied from libc/collate __wcollate_range_cmp */
1066 wchar_t s1[2], s2[2];
1067
1068 s1[0] = c1;
1069 s1[1] = L'\0';
1070 s2[0] = c2;
1071 s2[1] = L'\0';
1072 return (wcscoll(s1, s2));
1073 #endif
1074 }
1075
1076 /*
1077 - p_b_term - parse one term of a bracketed character list
1078 == static void p_b_term(struct parse *p, cset *cs);
1079 */
1080 static void
1081 p_b_term(struct parse *p, cset *cs)
1082 {
1083 char c;
1084 wint_t start, finish;
1085 wint_t i;
1086 #ifdef REGEX_LIBC_COLLATE
1087 struct xlocale_collate *table =
1088 (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
1089 #endif
1090
1091 _DIAGASSERT(p != NULL);
1092 _DIAGASSERT(cs != NULL);
1093
1094 /* classify what we've got */
1095 switch ((MORE()) ? PEEK() : '\0') {
1096 case '[':
1097 c = (MORE2()) ? PEEK2() : '\0';
1098 break;
1099 case '-':
1100 SETERROR(REG_ERANGE);
1101 return; /* NOTE RETURN */
1102 default:
1103 c = '\0';
1104 break;
1105 }
1106
1107 switch (c) {
1108 case ':': /* character class */
1109 NEXT2();
1110 (void)REQUIRE(MORE(), REG_EBRACK);
1111 c = PEEK();
1112 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
1113 p_b_cclass(p, cs);
1114 (void)REQUIRE(MORE(), REG_EBRACK);
1115 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
1116 break;
1117 case '=': /* equivalence class */
1118 NEXT2();
1119 (void)REQUIRE(MORE(), REG_EBRACK);
1120 c = PEEK();
1121 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
1122 p_b_eclass(p, cs);
1123 (void)REQUIRE(MORE(), REG_EBRACK);
1124 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
1125 break;
1126 default: /* symbol, ordinary character, or range */
1127 start = p_b_symbol(p);
1128 if (SEE('-') && MORE2() && PEEK2() != ']') {
1129 /* range */
1130 NEXT();
1131 if (EAT('-'))
1132 finish = '-';
1133 else
1134 finish = p_b_symbol(p);
1135 } else
1136 finish = start;
1137 if (start == finish)
1138 CHadd(p, cs, start);
1139 else {
1140 #ifdef REGEX_LIBC_COLLATE
1141 if (table->__collate_load_error || MB_CUR_MAX > 1) {
1142 #else
1143 if (MB_CUR_MAX > 1) {
1144 #endif
1145 (void)REQUIRE(start <= finish, REG_ERANGE);
1146 CHaddrange(p, cs, start, finish);
1147 } else {
1148 (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE);
1149 for (i = 0; i <= UCHAR_MAX; i++) {
1150 if (p_range_cmp(start, i) <= 0 &&
1151 p_range_cmp(i, finish) <= 0 )
1152 CHadd(p, cs, i);
1153 }
1154 }
1155 }
1156 break;
1157 }
1158 }
1159
1160 #ifdef REGEX_GNU_EXTENSIONS
1161 /*
1162 - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S)
1163 == static int p_b_pseudoclass(struct parse *p, char c)
1164 */
1165 static int
1166 p_b_pseudoclass(struct parse *p, char c) {
1167 cset *cs;
1168
1169 if ((cs = allocset(p)) == NULL)
1170 return(0);
1171
1172 if (p->g->cflags®_ICASE)
1173 cs->icase = 1;
1174
1175 switch (c) {
1176 case 'W':
1177 cs->invert = 1;
1178 /* FALLTHROUGH */
1179 case 'w':
1180 p_b_cclass_named(p, cs, "alnum");
1181 break;
1182 case 'S':
1183 cs->invert = 1;
1184 /* FALLTHROUGH */
1185 case 's':
1186 p_b_cclass_named(p, cs, "space");
1187 break;
1188 default:
1189 return(0);
1190 }
1191
1192 EMIT(OANYOF, (size_t)(cs - p->g->sets));
1193 return(1);
1194 }
1195 #endif
1196
1197 /*
1198 - p_b_cclass - parse a character-class name and deal with it
1199 == static void p_b_cclass(struct parse *p, cset *cs);
1200 */
1201 static void
1202 p_b_cclass(struct parse *p, cset *cs)
1203 {
1204 const char *sp = p->next;
1205 size_t len;
1206 char clname[16];
1207
1208 while (MORE() && isalpha((uch)PEEK()))
1209 NEXT();
1210 len = p->next - sp;
1211 if (len >= sizeof(clname) - 1) {
1212 SETERROR(REG_ECTYPE);
1213 return;
1214 }
1215 memcpy(clname, sp, len);
1216 clname[len] = '\0';
1217
1218 p_b_cclass_named(p, cs, clname);
1219 }
1220 /*
1221 - p_b_cclass_named - deal with a named character class
1222 == static void p_b_cclass_named(struct parse *p, cset *cs, const char []);
1223 */
1224 static void
1225 p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) {
1226 wctype_t wct;
1227
1228 if ((wct = wctype(clname)) == 0) {
1229 SETERROR(REG_ECTYPE);
1230 return;
1231 }
1232 CHaddtype(p, cs, wct);
1233 }
1234
1235 /*
1236 - p_b_eclass - parse an equivalence-class name and deal with it
1237 == static void p_b_eclass(struct parse *p, cset *cs);
1238 *
1239 * This implementation is incomplete. xxx
1240 */
1241 static void
1242 p_b_eclass(struct parse *p, cset *cs)
1243 {
1244 wint_t c;
1245
1246 _DIAGASSERT(p != NULL);
1247 _DIAGASSERT(cs != NULL);
1248
1249 c = p_b_coll_elem(p, '=');
1250 CHadd(p, cs, c);
1251 }
1252
1253 /*
1254 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1255 == static wint_t p_b_symbol(struct parse *p);
1256 */
1257 static wint_t /* value of symbol */
1258 p_b_symbol(struct parse *p)
1259 {
1260 wint_t value;
1261
1262 _DIAGASSERT(p != NULL);
1263
1264 (void)REQUIRE(MORE(), REG_EBRACK);
1265 if (!EATTWO('[', '.'))
1266 return(WGETNEXT());
1267
1268 /* collating symbol */
1269 value = p_b_coll_elem(p, '.');
1270 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1271 return(value);
1272 }
1273
1274 /*
1275 - p_b_coll_elem - parse a collating-element name and look it up
1276 == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
1277 */
1278 static wint_t /* value of collating element */
1279 p_b_coll_elem(struct parse *p,
1280 wint_t endc) /* name ended by endc,']' */
1281 {
1282 const char *sp = p->next;
1283 struct cname *cp;
1284 mbstate_t mbs;
1285 wchar_t wc;
1286 size_t clen, len;
1287
1288 _DIAGASSERT(p != NULL);
1289
1290 while (MORE() && !SEETWO(endc, ']'))
1291 NEXT();
1292 if (!MORE()) {
1293 SETERROR(REG_EBRACK);
1294 return(0);
1295 }
1296 len = p->next - sp;
1297 for (cp = cnames; cp->name != NULL; cp++)
1298 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1299 return(cp->code); /* known name */
1300 memset(&mbs, 0, sizeof(mbs));
1301 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1302 return (wc); /* single character */
1303 else if (clen == (size_t)-1 || clen == (size_t)-2)
1304 SETERROR(REG_ILLSEQ);
1305 else
1306 SETERROR(REG_ECOLLATE); /* neither */
1307 return(0);
1308 }
1309
1310 /*
1311 - may_escape - determine whether 'ch' is escape-able in the current context
1312 == static int may_escape(struct parse *p, const wint_t ch)
1313 */
1314 static bool
1315 may_escape(struct parse *p, const wint_t ch)
1316 {
1317
1318 if ((p->pflags & PFLAG_LEGACY_ESC) != 0)
1319 return (true);
1320 if (isalpha(ch) || ch == '\'' || ch == '`')
1321 return (false);
1322 return (true);
1323 #ifdef NOTYET
1324 /*
1325 * Build a whitelist of characters that may be escaped to produce an
1326 * ordinary in the current context. This assumes that these have not
1327 * been otherwise interpreted as a special character. Escaping an
1328 * ordinary character yields undefined results according to
1329 * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
1330 * advantage of this and use escaped ordinary characters to provide
1331 * special meaning, e.g. \b, \B, \w, \W, \s, \S.
1332 */
1333 switch(ch) {
1334 case '|':
1335 case '+':
1336 case '?':
1337 /* The above characters may not be escaped in BREs */
1338 if (!(p->g->cflags®_EXTENDED))
1339 return (false);
1340 /* Fallthrough */
1341 case '(':
1342 case ')':
1343 case '{':
1344 case '}':
1345 case '.':
1346 case '[':
1347 case ']':
1348 case '\\':
1349 case '*':
1350 case '^':
1351 case '$':
1352 return (true);
1353 default:
1354 return (false);
1355 }
1356 #endif
1357 }
1358
1359 /*
1360 - othercase - return the case counterpart of an alphabetic
1361 == static wint_t othercase(wint_t ch);
1362 */
1363 static wint_t /* if no counterpart, return ch */
1364 othercase(wint_t ch)
1365 {
1366 assert(iswalpha(ch));
1367 if (iswupper(ch))
1368 return(towlower(ch));
1369 else if (iswlower(ch))
1370 return(towupper(ch));
1371 else /* peculiar, but could happen */
1372 return(ch);
1373 }
1374
1375 /*
1376 - bothcases - emit a dualcase version of a two-case character
1377 == static void bothcases(struct parse *p, wint_t ch);
1378 *
1379 * Boy, is this implementation ever a kludge...
1380 */
1381 static void
1382 bothcases(struct parse *p, wint_t ch)
1383 {
1384 const char *oldnext = p->next;
1385 const char *oldend = p->end;
1386 char bracket[3 + MB_LEN_MAX];
1387 size_t n;
1388 mbstate_t mbs;
1389
1390 _DIAGASSERT(p != NULL);
1391
1392 assert(othercase(ch) != ch); /* p_bracket() would recurse */
1393 p->next = bracket;
1394 memset(&mbs, 0, sizeof(mbs));
1395 n = wcrtomb(bracket, ch, &mbs);
1396 assert(n != (size_t)-1);
1397 bracket[n] = ']';
1398 bracket[n + 1] = '\0';
1399 p->end = bracket+n+1;
1400 p_bracket(p);
1401 assert(p->next == p->end);
1402 p->next = oldnext;
1403 p->end = oldend;
1404 }
1405
1406 /*
1407 - ordinary - emit an ordinary character
1408 == static void ordinary(struct parse *p, wint_t ch);
1409 */
1410 static void
1411 ordinary(struct parse *p, wint_t ch)
1412 {
1413 cset *cs;
1414
1415 _DIAGASSERT(p != NULL);
1416
1417 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1418 bothcases(p, ch);
1419 else if ((wint_t)(ch & OPDMASK) == ch)
1420 EMIT(OCHAR, (size_t)ch);
1421 else {
1422 /*
1423 * Kludge: character is too big to fit into an OCHAR operand.
1424 * Emit a singleton set.
1425 */
1426 if ((cs = allocset(p)) == NULL)
1427 return;
1428 CHadd(p, cs, ch);
1429 EMIT(OANYOF, (size_t)(cs - p->g->sets));
1430 }
1431 }
1432
1433 /*
1434 - nonnewline - emit REG_NEWLINE version of OANY
1435 == static void nonnewline(struct parse *p);
1436 *
1437 * Boy, is this implementation ever a kludge...
1438 */
1439 static void
1440 nonnewline(struct parse *p)
1441 {
1442 const char *oldnext = p->next;
1443 const char *oldend = p->end;
1444 char bracket[4];
1445
1446 _DIAGASSERT(p != NULL);
1447
1448 p->next = bracket;
1449 p->end = bracket+3;
1450 bracket[0] = '^';
1451 bracket[1] = '\n';
1452 bracket[2] = ']';
1453 bracket[3] = '\0';
1454 p_bracket(p);
1455 assert(p->next == bracket+3);
1456 p->next = oldnext;
1457 p->end = oldend;
1458 }
1459
1460 /*
1461 - repeat - generate code for a bounded repetition, recursively if needed
1462 == static void repeat(struct parse *p, sopno start, int from, int to);
1463 */
1464 static void
1465 repeat(struct parse *p,
1466 sopno start, /* operand from here to end of strip */
1467 int from, /* repeated from this number */
1468 int to) /* to this number of times (maybe INFINITY) */
1469 {
1470 sopno finish = HERE();
1471 # define N 2
1472 # define INF 3
1473 # define REP(f, t) ((f)*8 + (t))
1474 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1475 sopno copy;
1476
1477 _DIAGASSERT(p != NULL);
1478
1479 if (p->error != 0) /* head off possible runaway recursion */
1480 return;
1481
1482 assert(from <= to);
1483
1484 switch (REP(MAP(from), MAP(to))) {
1485 case REP(0, 0): /* must be user doing this */
1486 DROP(finish-start); /* drop the operand */
1487 break;
1488 case REP(0, 1): /* as x{1,1}? */
1489 case REP(0, N): /* as x{1,n}? */
1490 case REP(0, INF): /* as x{1,}? */
1491 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1492 INSERT(OCH_, start); /* offset is wrong... */
1493 repeat(p, start+1, 1, to);
1494 ASTERN(OOR1, start);
1495 AHEAD(start); /* ... fix it */
1496 EMIT(OOR2, 0);
1497 AHEAD(THERE());
1498 ASTERN(O_CH, THERETHERE());
1499 break;
1500 case REP(1, 1): /* trivial case */
1501 /* done */
1502 break;
1503 case REP(1, N): /* as x?x{1,n-1} */
1504 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1505 INSERT(OCH_, start);
1506 ASTERN(OOR1, start);
1507 AHEAD(start);
1508 EMIT(OOR2, 0); /* offset very wrong... */
1509 AHEAD(THERE()); /* ...so fix it */
1510 ASTERN(O_CH, THERETHERE());
1511 copy = dupl(p, start+1, finish+1);
1512 assert(copy == finish+4);
1513 repeat(p, copy, 1, to-1);
1514 break;
1515 case REP(1, INF): /* as x+ */
1516 INSERT(OPLUS_, start);
1517 ASTERN(O_PLUS, start);
1518 break;
1519 case REP(N, N): /* as xx{m-1,n-1} */
1520 copy = dupl(p, start, finish);
1521 repeat(p, copy, from-1, to-1);
1522 break;
1523 case REP(N, INF): /* as xx{n-1,INF} */
1524 copy = dupl(p, start, finish);
1525 repeat(p, copy, from-1, to);
1526 break;
1527 default: /* "can't happen" */
1528 SETERROR(REG_ASSERT); /* just in case */
1529 break;
1530 }
1531 }
1532
1533 /*
1534 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1535 - character from the parse struct, signals a REG_ILLSEQ error if the
1536 - character can't be converted. Returns the number of bytes consumed.
1537 */
1538 static wint_t
1539 wgetnext(struct parse *p)
1540 {
1541 mbstate_t mbs;
1542 wchar_t wc;
1543 size_t n;
1544
1545 memset(&mbs, 0, sizeof(mbs));
1546 n = mbrtowc(&wc, p->next, (size_t)(p->end - p->next), &mbs);
1547 if (n == (size_t)-1 || n == (size_t)-2) {
1548 SETERROR(REG_ILLSEQ);
1549 return (0);
1550 }
1551 if (n == 0)
1552 n = 1;
1553 p->next += n;
1554 return (wc);
1555 }
1556
1557 /*
1558 - seterr - set an error condition
1559 == static int seterr(struct parse *p, int e);
1560 */
1561 static int /* useless but makes type checking happy */
1562 seterr(struct parse *p, int e)
1563 {
1564
1565 _DIAGASSERT(p != NULL);
1566
1567 if (p->error == 0) /* keep earliest error condition */
1568 p->error = e;
1569 p->next = nuls; /* try to bring things to a halt */
1570 p->end = nuls;
1571 return(0); /* make the return value well-defined */
1572 }
1573
1574 /*
1575 - allocset - allocate a set of characters for []
1576 == static cset *allocset(struct parse *p);
1577 */
1578 static cset *
1579 allocset(struct parse *p)
1580 {
1581 cset *cs, *ncs;
1582
1583 _DIAGASSERT(p != NULL);
1584
1585 ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs));
1586 if (ncs == NULL) {
1587 SETERROR(REG_ESPACE);
1588 return (NULL);
1589 }
1590 p->g->sets = ncs;
1591 cs = &p->g->sets[p->g->ncsets++];
1592 memset(cs, 0, sizeof(*cs));
1593
1594 return(cs);
1595 }
1596
1597 /*
1598 - freeset - free a now-unused set
1599 == static void freeset(struct parse *p, cset *cs);
1600 */
1601 static void
1602 freeset(struct parse *p, cset *cs)
1603 {
1604 cset *top;
1605
1606 _DIAGASSERT(p != NULL);
1607 _DIAGASSERT(cs != NULL);
1608
1609 top = &p->g->sets[p->g->ncsets];
1610
1611 free(cs->wides);
1612 free(cs->ranges);
1613 free(cs->types);
1614 memset(cs, 0, sizeof(*cs));
1615 if (cs == top-1) /* recover only the easy case */
1616 p->g->ncsets--;
1617 }
1618
1619 /*
1620 - singleton - Determine whether a set contains only one character,
1621 - returning it if so, otherwise returning OUT.
1622 */
1623 static wint_t
1624 singleton(cset *cs)
1625 {
1626 wint_t i, s, n;
1627
1628 for (i = n = 0; i < NC; i++)
1629 if (CHIN(cs, i)) {
1630 n++;
1631 s = i;
1632 }
1633 if (n == 1)
1634 return (s);
1635 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1636 cs->icase == 0)
1637 return (cs->wides[0]);
1638 /* Don't bother handling the other cases. */
1639 return (OUT);
1640 }
1641
1642 /*
1643 - CHadd - add character to character set.
1644 */
1645 static void
1646 CHadd(struct parse *p, cset *cs, wint_t ch)
1647 {
1648 wint_t nch, *newwides;
1649
1650 _DIAGASSERT(p != NULL);
1651 _DIAGASSERT(cs != NULL);
1652
1653 assert(ch >= 0);
1654 if (ch < NC)
1655 cs->bmp[(unsigned)ch >> 3] |= 1 << (ch & 7);
1656 else {
1657 newwides = reallocarray(cs->wides, cs->nwides + 1,
1658 sizeof(*cs->wides));
1659 if (newwides == NULL) {
1660 SETERROR(REG_ESPACE);
1661 return;
1662 }
1663 cs->wides = newwides;
1664 cs->wides[cs->nwides++] = ch;
1665 }
1666 if (cs->icase) {
1667 if ((nch = towlower(ch)) < NC)
1668 cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1669 if ((nch = towupper(ch)) < NC)
1670 cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1671 }
1672 }
1673
1674 /*
1675 - CHaddrange - add all characters in the range [min,max] to a character set.
1676 */
1677 static void
1678 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1679 {
1680 crange *newranges;
1681
1682 _DIAGASSERT(p != NULL);
1683 _DIAGASSERT(cs != NULL);
1684
1685 for (; min < NC && min <= max; min++)
1686 CHadd(p, cs, min);
1687 if (min >= max)
1688 return;
1689 newranges = reallocarray(cs->ranges, cs->nranges + 1,
1690 sizeof(*cs->ranges));
1691 if (newranges == NULL) {
1692 SETERROR(REG_ESPACE);
1693 return;
1694 }
1695 cs->ranges = newranges;
1696 cs->ranges[cs->nranges].min = min;
1697 cs->ranges[cs->nranges].max = max;
1698 cs->nranges++;
1699 }
1700
1701 /*
1702 - CHaddtype - add all characters of a certain type to a character set.
1703 */
1704 static void
1705 CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1706 {
1707 wint_t i;
1708 wctype_t *newtypes;
1709
1710 _DIAGASSERT(p != NULL);
1711 _DIAGASSERT(cs != NULL);
1712
1713 for (i = 0; i < NC; i++)
1714 if (iswctype(i, wct))
1715 CHadd(p, cs, i);
1716 newtypes = reallocarray(cs->types, cs->ntypes + 1,
1717 sizeof(*cs->types));
1718 if (newtypes == NULL) {
1719 SETERROR(REG_ESPACE);
1720 return;
1721 }
1722 cs->types = newtypes;
1723 cs->types[cs->ntypes++] = wct;
1724 }
1725
1726 /*
1727 - dupl - emit a duplicate of a bunch of sops
1728 == static sopno dupl(struct parse *p, sopno start, sopno finish);
1729 */
1730 static sopno /* start of duplicate */
1731 dupl(struct parse *p,
1732 sopno start, /* from here */
1733 sopno finish) /* to this less one */
1734 {
1735 sopno ret = HERE();
1736 sopno len = finish - start;
1737
1738 _DIAGASSERT(p != NULL);
1739
1740 assert(finish >= start);
1741 if (len == 0)
1742 return(ret);
1743 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1744 return(ret);
1745 (void) memcpy(p->strip + p->slen,
1746 p->strip + start, len * sizeof(*p->strip));
1747 p->slen += len;
1748 return(ret);
1749 }
1750
1751 /*
1752 - doemit - emit a strip operator
1753 == static void doemit(struct parse *p, sop op, size_t opnd);
1754 *
1755 * It might seem better to implement this as a macro with a function as
1756 * hard-case backup, but it's just too big and messy unless there are
1757 * some changes to the data structures. Maybe later.
1758 */
1759 static void
1760 doemit(struct parse *p, sop op, size_t opnd)
1761 {
1762 /* avoid making error situations worse */
1763 if (p->error != 0)
1764 return;
1765
1766 _DIAGASSERT(p != NULL);
1767
1768 /* deal with oversize operands ("can't happen", more or less) */
1769 assert(opnd < 1<<OPSHIFT);
1770
1771 /* deal with undersized strip */
1772 if (p->slen >= p->ssize)
1773 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */
1774 return;
1775
1776 /* finally, it's all reduced to the easy case */
1777 p->strip[p->slen++] = (sopno)SOP(op, opnd);
1778 }
1779
1780 /*
1781 - doinsert - insert a sop into the strip
1782 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1783 */
1784 static void
1785 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1786 {
1787 sopno sn;
1788 sop s;
1789 int i;
1790
1791 _DIAGASSERT(p != NULL);
1792
1793 /* avoid making error situations worse */
1794 if (p->error != 0)
1795 return;
1796
1797 sn = HERE();
1798 EMIT(op, opnd); /* do checks, ensure space */
1799 assert(HERE() == sn+1);
1800 s = p->strip[sn];
1801
1802 /* adjust paren pointers */
1803 assert(pos > 0);
1804 for (i = 1; i < NPAREN; i++) {
1805 if (p->pbegin[i] >= pos) {
1806 p->pbegin[i]++;
1807 }
1808 if (p->pend[i] >= pos) {
1809 p->pend[i]++;
1810 }
1811 }
1812
1813 memmove(&p->strip[pos+1], &p->strip[pos],
1814 (HERE()-pos-1)*sizeof(*p->strip));
1815 p->strip[pos] = s;
1816 }
1817
1818 /*
1819 - dofwd - complete a forward reference
1820 == static void dofwd(struct parse *p, sopno pos, sop value);
1821 */
1822 static void
1823 dofwd(struct parse *p, sopno pos, sop value)
1824 {
1825
1826 _DIAGASSERT(p != NULL);
1827
1828 /* avoid making error situations worse */
1829 if (p->error != 0)
1830 return;
1831
1832 assert(value < 1<<OPSHIFT);
1833 p->strip[pos] = OP(p->strip[pos]) | value;
1834 }
1835
1836 /*
1837 - enlarge - enlarge the strip
1838 == static int enlarge(struct parse *p, sopno size);
1839 */
1840 static int
1841 enlarge(struct parse *p, sopno size)
1842 {
1843 sop *sp;
1844
1845 _DIAGASSERT(p != NULL);
1846
1847 if (p->ssize >= size)
1848 return 1;
1849
1850 sp = reallocarray(p->strip, size, sizeof(*p->strip));
1851 if (sp == NULL) {
1852 SETERROR(REG_ESPACE);
1853 return 0;
1854 }
1855 p->strip = sp;
1856 p->ssize = size;
1857 return 1;
1858 }
1859
1860 /*
1861 - stripsnug - compact the strip
1862 == static void stripsnug(struct parse *p, struct re_guts *g);
1863 */
1864 static void
1865 stripsnug(struct parse *p, struct re_guts *g)
1866 {
1867
1868 _DIAGASSERT(p != NULL);
1869 _DIAGASSERT(g != NULL);
1870
1871 g->nstates = p->slen;
1872 g->strip = reallocarray(p->strip, p->slen, sizeof(*p->strip));
1873 if (g->strip == NULL) {
1874 SETERROR(REG_ESPACE);
1875 g->strip = p->strip;
1876 }
1877 }
1878
1879 /*
1880 - findmust - fill in must and mlen with longest mandatory literal string
1881 == static void findmust(struct parse *p, struct re_guts *g);
1882 *
1883 * This algorithm could do fancy things like analyzing the operands of |
1884 * for common subsequences. Someday. This code is simple and finds most
1885 * of the interesting cases.
1886 *
1887 * Note that must and mlen got initialized during setup.
1888 */
1889 static void
1890 findmust(struct parse *p, struct re_guts *g)
1891 {
1892 sop *scan;
1893 sop *start = NULL;
1894 sop *newstart = NULL;
1895 sopno newlen;
1896 sop s;
1897 char *cp;
1898 int offset;
1899 char buf[MB_LEN_MAX];
1900 size_t clen;
1901 mbstate_t mbs;
1902
1903 _DIAGASSERT(p != NULL);
1904 _DIAGASSERT(g != NULL);
1905
1906 /* avoid making error situations worse */
1907 if (p->error != 0)
1908 return;
1909
1910 #ifdef notyet
1911 /*
1912 * It's not generally safe to do a ``char'' substring search on
1913 * multibyte character strings, but it's safe for at least
1914 * UTF-8 (see RFC 3629).
1915 */
1916 if (MB_CUR_MAX > 1 &&
1917 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1918 return;
1919 #endif
1920
1921 /* find the longest OCHAR sequence in strip */
1922 newlen = 0;
1923 offset = 0;
1924 g->moffset = 0;
1925 scan = g->strip + 1;
1926 do {
1927 s = *scan++;
1928 switch (OP(s)) {
1929 case OCHAR: /* sequence member */
1930 if (newlen == 0) { /* new sequence */
1931 memset(&mbs, 0, sizeof(mbs));
1932 newstart = scan - 1;
1933 }
1934 clen = wcrtomb(buf, (int)OPND(s), &mbs);
1935 if (clen == (size_t)-1)
1936 goto toohard;
1937 newlen += (sopno)clen;
1938 break;
1939 case OPLUS_: /* things that don't break one */
1940 case OLPAREN:
1941 case ORPAREN:
1942 break;
1943 case OQUEST_: /* things that must be skipped */
1944 case OCH_:
1945 offset = altoffset(scan, offset);
1946 scan--;
1947 do {
1948 scan += OPND(s);
1949 s = *scan;
1950 /* assert() interferes w debug printouts */
1951 if (OP(s) != O_QUEST &&
1952 OP(s) != O_CH && OP(s) != OOR2) {
1953 g->iflags |= BAD;
1954 return;
1955 }
1956 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1957 /* FALLTHROUGH */
1958 case OBOW: /* things that break a sequence */
1959 case OEOW:
1960 case OBOL:
1961 case OEOL:
1962 case OBOS:
1963 case OEOS:
1964 case OWBND:
1965 case ONWBND:
1966 case O_QUEST:
1967 case O_CH:
1968 case OEND:
1969 if (newlen > (sopno)g->mlen) { /* ends one */
1970 start = newstart;
1971 g->mlen = newlen;
1972 if (offset > -1) {
1973 g->moffset += offset;
1974 offset = newlen;
1975 } else
1976 g->moffset = offset;
1977 } else {
1978 if (offset > -1)
1979 offset += newlen;
1980 }
1981 newlen = 0;
1982 break;
1983 case OANY:
1984 if (newlen > (sopno)g->mlen) { /* ends one */
1985 start = newstart;
1986 g->mlen = newlen;
1987 if (offset > -1) {
1988 g->moffset += offset;
1989 offset = newlen;
1990 } else
1991 g->moffset = offset;
1992 } else {
1993 if (offset > -1)
1994 offset += newlen;
1995 }
1996 if (offset > -1)
1997 offset++;
1998 newlen = 0;
1999 break;
2000 case OANYOF: /* may or may not invalidate offset */
2001 /* First, everything as OANY */
2002 if (newlen > (sopno)g->mlen) { /* ends one */
2003 start = newstart;
2004 g->mlen = newlen;
2005 if (offset > -1) {
2006 g->moffset += offset;
2007 offset = newlen;
2008 } else
2009 g->moffset = offset;
2010 } else {
2011 if (offset > -1)
2012 offset += newlen;
2013 }
2014 if (offset > -1)
2015 offset++;
2016 newlen = 0;
2017 break;
2018 toohard:/*FALLTHROUGH*/
2019 default:
2020 /* Anything here makes it impossible or too hard
2021 * to calculate the offset -- so we give up;
2022 * save the last known good offset, in case the
2023 * must sequence doesn't occur later.
2024 */
2025 if (newlen > (sopno)g->mlen) { /* ends one */
2026 start = newstart;
2027 g->mlen = newlen;
2028 if (offset > -1)
2029 g->moffset += offset;
2030 else
2031 g->moffset = offset;
2032 }
2033 offset = -1;
2034 newlen = 0;
2035 break;
2036 }
2037 } while (OP(s) != OEND);
2038
2039 if (g->mlen == 0) { /* there isn't one */
2040 g->moffset = -1;
2041 return;
2042 }
2043
2044 /* turn it into a character string */
2045 g->must = malloc((size_t)g->mlen + 1);
2046 if (g->must == NULL) { /* argh; just forget it */
2047 g->mlen = 0;
2048 g->moffset = -1;
2049 return;
2050 }
2051 cp = g->must;
2052 scan = start;
2053 memset(&mbs, 0, sizeof(mbs));
2054 while (cp < g->must + g->mlen) {
2055 while (OP(s = *scan++) != OCHAR)
2056 continue;
2057 clen = wcrtomb(cp, (int)OPND(s), &mbs);
2058 assert(clen != (size_t)-1);
2059 cp += clen;
2060 }
2061 assert(cp == g->must + g->mlen);
2062 *cp++ = '\0'; /* just on general principles */
2063 }
2064
2065 /*
2066 - altoffset - choose biggest offset among multiple choices
2067 == static int altoffset(sop *scan, int offset);
2068 *
2069 * Compute, recursively if necessary, the largest offset among multiple
2070 * re paths.
2071 */
2072 static int
2073 altoffset(sop *scan, int offset)
2074 {
2075 int largest;
2076 int try;
2077 sop s;
2078
2079 _DIAGASSERT(scan != NULL);
2080
2081 /* If we gave up already on offsets, return */
2082 if (offset == -1)
2083 return -1;
2084
2085 largest = 0;
2086 try = 0;
2087 s = *scan++;
2088 while (OP(s) != O_QUEST && OP(s) != O_CH) {
2089 switch (OP(s)) {
2090 case OOR1:
2091 if (try > largest)
2092 largest = try;
2093 try = 0;
2094 break;
2095 case OQUEST_:
2096 case OCH_:
2097 try = altoffset(scan, try);
2098 if (try == -1)
2099 return -1;
2100 scan--;
2101 do {
2102 scan += OPND(s);
2103 s = *scan;
2104 if (OP(s) != O_QUEST &&
2105 OP(s) != O_CH && OP(s) != OOR2)
2106 return -1;
2107 } while (OP(s) != O_QUEST && OP(s) != O_CH);
2108 /* We must skip to the next position, or we'll
2109 * leave altoffset() too early.
2110 */
2111 scan++;
2112 break;
2113 case OANYOF:
2114 case OCHAR:
2115 case OANY:
2116 try++;
2117 /*FALLTHROUGH*/
2118 case OBOW:
2119 case OEOW:
2120 case OWBND:
2121 case ONWBND:
2122 case OLPAREN:
2123 case ORPAREN:
2124 case OOR2:
2125 break;
2126 default:
2127 try = -1;
2128 break;
2129 }
2130 if (try == -1)
2131 return -1;
2132 s = *scan++;
2133 }
2134
2135 if (try > largest)
2136 largest = try;
2137
2138 return largest+offset;
2139 }
2140
2141 /*
2142 - computejumps - compute char jumps for BM scan
2143 == static void computejumps(struct parse *p, struct re_guts *g);
2144 *
2145 * This algorithm assumes g->must exists and is has size greater than
2146 * zero. It's based on the algorithm found on Computer Algorithms by
2147 * Sara Baase.
2148 *
2149 * A char jump is the number of characters one needs to jump based on
2150 * the value of the character from the text that was mismatched.
2151 */
2152 static void
2153 computejumps(struct parse *p, struct re_guts *g)
2154 {
2155 int ch;
2156 size_t mindex;
2157
2158 _DIAGASSERT(p != NULL);
2159 _DIAGASSERT(g != NULL);
2160
2161 /* Avoid making errors worse */
2162 if (p->error != 0)
2163 return;
2164
2165 g->charjump = calloc((NC_MAX + 1), sizeof(*g->charjump));
2166 if (g->charjump == NULL) /* Not a fatal error */
2167 return;
2168 /* Adjust for signed chars, if necessary */
2169 g->charjump = &g->charjump[-(CHAR_MIN)];
2170
2171 /* If the character does not exist in the pattern, the jump
2172 * is equal to the number of characters in the pattern.
2173 */
2174 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
2175 g->charjump[ch] = g->mlen;
2176
2177 /* If the character does exist, compute the jump that would
2178 * take us to the last character in the pattern equal to it
2179 * (notice that we match right to left, so that last character
2180 * is the first one that would be matched).
2181 */
2182 for (mindex = 0; mindex < g->mlen; mindex++)
2183 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
2184 }
2185
2186 /*
2187 - computematchjumps - compute match jumps for BM scan
2188 == static void computematchjumps(struct parse *p, struct re_guts *g);
2189 *
2190 * This algorithm assumes g->must exists and is has size greater than
2191 * zero. It's based on the algorithm found on Computer Algorithms by
2192 * Sara Baase.
2193 *
2194 * A match jump is the number of characters one needs to advance based
2195 * on the already-matched suffix.
2196 * Notice that all values here are minus (g->mlen-1), because of the way
2197 * the search algorithm works.
2198 */
2199 static void
2200 computematchjumps(struct parse *p, struct re_guts *g)
2201 {
2202 size_t mindex; /* General "must" iterator */
2203 size_t suffix; /* Keeps track of matching suffix */
2204 size_t ssuffix; /* Keeps track of suffixes' suffix */
2205 size_t* pmatches; /* pmatches[k] points to the next i
2206 * such that i+1...mlen is a substring
2207 * of k+1...k+mlen-i-1
2208 */
2209
2210 _DIAGASSERT(p != NULL);
2211 _DIAGASSERT(g != NULL);
2212
2213 /* Avoid making errors worse */
2214 if (p->error != 0)
2215 return;
2216
2217 pmatches = calloc(g->mlen, sizeof(*pmatches));
2218 if (pmatches == NULL) {
2219 g->matchjump = NULL;
2220 return;
2221 }
2222
2223 g->matchjump = calloc(g->mlen, sizeof(*g->matchjump));
2224 if (g->matchjump == NULL) { /* Not a fatal error */
2225 free(pmatches);
2226 return;
2227 }
2228
2229 /* Set maximum possible jump for each character in the pattern */
2230 for (mindex = 0; mindex < g->mlen; mindex++)
2231 g->matchjump[mindex] = 2 * g->mlen - mindex - 1;
2232
2233 /* Compute pmatches[] */
2234 for (suffix = mindex = g->mlen; mindex-- > 0; suffix--) {
2235 pmatches[mindex] = suffix;
2236
2237 /* If a mismatch is found, interrupting the substring,
2238 * compute the matchjump for that position. If no
2239 * mismatch is found, then a text substring mismatched
2240 * against the suffix will also mismatch against the
2241 * substring.
2242 */
2243 while (suffix < g->mlen
2244 && g->must[mindex] != g->must[suffix]) {
2245 g->matchjump[suffix] = MIN(g->matchjump[suffix],
2246 g->mlen - mindex - 1);
2247 suffix = pmatches[suffix];
2248 }
2249 }
2250
2251 /* Compute the matchjump up to the last substring found to jump
2252 * to the beginning of the largest must pattern prefix matching
2253 * it's own suffix.
2254 */
2255 for (mindex = 0; mindex <= suffix; mindex++)
2256 g->matchjump[mindex] = MIN(g->matchjump[mindex],
2257 g->mlen + suffix - mindex);
2258
2259 ssuffix = pmatches[suffix];
2260 while (suffix < g->mlen) {
2261 while (suffix <= ssuffix && suffix < g->mlen) {
2262 g->matchjump[suffix] = MIN(g->matchjump[suffix],
2263 g->mlen + ssuffix - suffix);
2264 suffix++;
2265 }
2266 if (suffix < g->mlen)
2267 ssuffix = pmatches[ssuffix];
2268 }
2269
2270 free(pmatches);
2271 }
2272
2273 /*
2274 - pluscount - count + nesting
2275 == static sopno pluscount(struct parse *p, struct re_guts *g);
2276 */
2277 static sopno /* nesting depth */
2278 pluscount(struct parse *p, struct re_guts *g)
2279 {
2280 sop *scan;
2281 sop s;
2282 sopno plusnest = 0;
2283 sopno maxnest = 0;
2284
2285 _DIAGASSERT(p != NULL);
2286 _DIAGASSERT(g != NULL);
2287
2288 if (p->error != 0)
2289 return(0); /* there may not be an OEND */
2290
2291 scan = g->strip + 1;
2292 do {
2293 s = *scan++;
2294 switch (OP(s)) {
2295 case OPLUS_:
2296 plusnest++;
2297 break;
2298 case O_PLUS:
2299 if (plusnest > maxnest)
2300 maxnest = plusnest;
2301 plusnest--;
2302 break;
2303 }
2304 } while (OP(s) != OEND);
2305 if (plusnest != 0)
2306 g->iflags |= BAD;
2307 return(maxnest);
2308 }
2309