regcomp.c revision 1.42 1 1.42 christos /* $NetBSD: regcomp.c,v 1.42 2021/02/25 21:28:40 christos Exp $ */
2 1.6 cgd
3 1.5 cgd /*-
4 1.39 christos * SPDX-License-Identifier: BSD-3-Clause
5 1.39 christos *
6 1.39 christos * Copyright (c) 1992, 1993, 1994 Henry Spencer.
7 1.5 cgd * Copyright (c) 1992, 1993, 1994
8 1.5 cgd * The Regents of the University of California. All rights reserved.
9 1.5 cgd *
10 1.39 christos * Copyright (c) 2011 The FreeBSD Foundation
11 1.39 christos * All rights reserved.
12 1.39 christos * Portions of this software were developed by David Chisnall
13 1.39 christos * under sponsorship from the FreeBSD Foundation.
14 1.39 christos *
15 1.5 cgd * This code is derived from software contributed to Berkeley by
16 1.5 cgd * Henry Spencer.
17 1.5 cgd *
18 1.5 cgd * Redistribution and use in source and binary forms, with or without
19 1.5 cgd * modification, are permitted provided that the following conditions
20 1.5 cgd * are met:
21 1.5 cgd * 1. Redistributions of source code must retain the above copyright
22 1.5 cgd * notice, this list of conditions and the following disclaimer.
23 1.5 cgd * 2. Redistributions in binary form must reproduce the above copyright
24 1.5 cgd * notice, this list of conditions and the following disclaimer in the
25 1.5 cgd * documentation and/or other materials provided with the distribution.
26 1.18 agc * 3. Neither the name of the University nor the names of its contributors
27 1.18 agc * may be used to endorse or promote products derived from this software
28 1.18 agc * without specific prior written permission.
29 1.18 agc *
30 1.18 agc * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31 1.18 agc * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 1.18 agc * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 1.18 agc * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34 1.18 agc * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35 1.18 agc * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36 1.18 agc * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37 1.18 agc * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38 1.18 agc * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 1.18 agc * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40 1.18 agc * SUCH DAMAGE.
41 1.18 agc *
42 1.18 agc * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
43 1.18 agc */
44 1.18 agc
45 1.7 christos #include <sys/cdefs.h>
46 1.6 cgd #if 0
47 1.5 cgd static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
48 1.39 christos __FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 368359 2020-12-05 03:18:48Z kevans $");
49 1.6 cgd #endif
50 1.42 christos __RCSID("$NetBSD: regcomp.c,v 1.42 2021/02/25 21:28:40 christos Exp $");
51 1.39 christos
52 1.39 christos #define _OPENBSD_SOURCE
53 1.42 christos
54 1.42 christos #ifndef LIBHACK
55 1.39 christos #define REGEX_GNU_EXTENSIONS
56 1.5 cgd
57 1.8 jtc #include "namespace.h"
58 1.42 christos #endif
59 1.1 jtc #include <sys/types.h>
60 1.39 christos #include <stdio.h>
61 1.39 christos #include <string.h>
62 1.1 jtc #include <ctype.h>
63 1.1 jtc #include <limits.h>
64 1.1 jtc #include <stdlib.h>
65 1.28 junyoung #include <regex.h>
66 1.39 christos #include <stdbool.h>
67 1.8 jtc
68 1.42 christos #if defined(_weak_alias) && !defined(LIBHACK)
69 1.16 mycroft __weak_alias(regcomp,_regcomp)
70 1.8 jtc #endif
71 1.1 jtc
72 1.39 christos #ifdef REGEX_LIBC_COLLATE
73 1.39 christos #include "collate.h"
74 1.39 christos #endif
75 1.39 christos
76 1.1 jtc #include "utils.h"
77 1.1 jtc #include "regex2.h"
78 1.1 jtc
79 1.1 jtc #include "cname.h"
80 1.1 jtc
81 1.1 jtc /*
82 1.39 christos * Branching context, used to keep track of branch state for all of the branch-
83 1.39 christos * aware functions. In addition to keeping track of branch positions for the
84 1.39 christos * p_branch_* functions, we use this to simplify some clumsiness in BREs for
85 1.39 christos * detection of whether ^ is acting as an anchor or being used erroneously and
86 1.39 christos * also for whether we're in a sub-expression or not.
87 1.39 christos */
88 1.39 christos struct branchc {
89 1.39 christos sopno start;
90 1.39 christos sopno back;
91 1.39 christos sopno fwd;
92 1.39 christos
93 1.39 christos int nbranch;
94 1.39 christos int nchain;
95 1.39 christos bool outer;
96 1.39 christos bool terminate;
97 1.39 christos };
98 1.39 christos
99 1.39 christos /*
100 1.1 jtc * parse structure, passed up and down to avoid global variables and
101 1.1 jtc * other clumsinesses
102 1.1 jtc */
103 1.1 jtc struct parse {
104 1.21 yamt const char *next; /* next character in RE */
105 1.21 yamt const char *end; /* end of string (-> NUL normally) */
106 1.1 jtc int error; /* has an error been seen? */
107 1.39 christos int gnuext;
108 1.1 jtc sop *strip; /* malloced strip */
109 1.1 jtc sopno ssize; /* malloced strip size (allocated) */
110 1.1 jtc sopno slen; /* malloced strip length (used) */
111 1.33 christos size_t ncsalloc; /* number of csets allocated */
112 1.1 jtc struct re_guts *g;
113 1.1 jtc # define NPAREN 10 /* we need to remember () 1-9 for back refs */
114 1.1 jtc sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
115 1.1 jtc sopno pend[NPAREN]; /* -> ) ([0] unused) */
116 1.39 christos bool allowbranch; /* can this expression branch? */
117 1.39 christos bool bre; /* convenience; is this a BRE? */
118 1.39 christos int pflags; /* other parsing flags -- legacy escapes? */
119 1.39 christos bool (*parse_expr)(struct parse *, struct branchc *);
120 1.39 christos void (*pre_parse)(struct parse *, struct branchc *);
121 1.39 christos void (*post_parse)(struct parse *, struct branchc *);
122 1.1 jtc };
123 1.1 jtc
124 1.39 christos #define PFLAG_LEGACY_ESC 0x00000001
125 1.39 christos
126 1.5 cgd /* ========= begin header generated by ./mkh ========= */
127 1.5 cgd #ifdef __cplusplus
128 1.5 cgd extern "C" {
129 1.5 cgd #endif
130 1.5 cgd
131 1.5 cgd /* === regcomp.c === */
132 1.39 christos static bool p_ere_exp(struct parse *p, struct branchc *bc);
133 1.26 junyoung static void p_str(struct parse *p);
134 1.39 christos static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
135 1.39 christos static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
136 1.39 christos static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
137 1.39 christos static bool p_branch_empty(struct parse *p, struct branchc *bc);
138 1.39 christos static bool p_branch_do(struct parse *p, struct branchc *bc);
139 1.39 christos static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
140 1.39 christos static void p_bre_post_parse(struct parse *p, struct branchc *bc);
141 1.39 christos static void p_re(struct parse *p, int end1, int end2);
142 1.39 christos static bool p_simp_re(struct parse *p, struct branchc *bc);
143 1.26 junyoung static int p_count(struct parse *p);
144 1.26 junyoung static void p_bracket(struct parse *p);
145 1.39 christos static int p_range_cmp(wchar_t c1, wchar_t c2);
146 1.26 junyoung static void p_b_term(struct parse *p, cset *cs);
147 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
148 1.39 christos static int p_b_pseudoclass(struct parse *p, char c);
149 1.39 christos #endif
150 1.26 junyoung static void p_b_cclass(struct parse *p, cset *cs);
151 1.39 christos static void p_b_cclass_named(struct parse *p, cset *cs, const char[]);
152 1.26 junyoung static void p_b_eclass(struct parse *p, cset *cs);
153 1.39 christos static wint_t p_b_symbol(struct parse *p);
154 1.39 christos static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
155 1.39 christos static bool may_escape(struct parse *p, const wint_t ch);
156 1.39 christos static wint_t othercase(wint_t ch);
157 1.39 christos static void bothcases(struct parse *p, wint_t ch);
158 1.39 christos static void ordinary(struct parse *p, wint_t ch);
159 1.26 junyoung static void nonnewline(struct parse *p);
160 1.39 christos static void repeat(struct parse *p, sopno start, int from, int to);
161 1.26 junyoung static int seterr(struct parse *p, int e);
162 1.26 junyoung static cset *allocset(struct parse *p);
163 1.26 junyoung static void freeset(struct parse *p, cset *cs);
164 1.39 christos static void CHadd(struct parse *p, cset *cs, wint_t ch);
165 1.39 christos static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
166 1.39 christos static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
167 1.39 christos static wint_t singleton(cset *cs);
168 1.26 junyoung static sopno dupl(struct parse *p, sopno start, sopno finish);
169 1.39 christos static void doemit(struct parse *p, sop op, size_t opnd);
170 1.39 christos static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
171 1.39 christos static void dofwd(struct parse *p, sopno pos, sop value);
172 1.30 christos static int enlarge(struct parse *p, sopno size);
173 1.26 junyoung static void stripsnug(struct parse *p, struct re_guts *g);
174 1.26 junyoung static void findmust(struct parse *p, struct re_guts *g);
175 1.39 christos static int altoffset(sop *scan, int offset);
176 1.39 christos static void computejumps(struct parse *p, struct re_guts *g);
177 1.39 christos static void computematchjumps(struct parse *p, struct re_guts *g);
178 1.26 junyoung static sopno pluscount(struct parse *p, struct re_guts *g);
179 1.39 christos static wint_t wgetnext(struct parse *p);
180 1.5 cgd
181 1.5 cgd #ifdef __cplusplus
182 1.5 cgd }
183 1.5 cgd #endif
184 1.5 cgd /* ========= end header generated by ./mkh ========= */
185 1.1 jtc
186 1.1 jtc static char nuls[10]; /* place to point scanner in event of error */
187 1.1 jtc
188 1.1 jtc /*
189 1.1 jtc * macros for use with parse structure
190 1.1 jtc * BEWARE: these know that the parse structure is named `p' !!!
191 1.1 jtc */
192 1.1 jtc #define PEEK() (*p->next)
193 1.1 jtc #define PEEK2() (*(p->next+1))
194 1.1 jtc #define MORE() (p->next < p->end)
195 1.1 jtc #define MORE2() (p->next+1 < p->end)
196 1.1 jtc #define SEE(c) (MORE() && PEEK() == (c))
197 1.1 jtc #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
198 1.39 christos #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a))
199 1.1 jtc #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
200 1.1 jtc #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
201 1.39 christos #define EATSPEC(a) (p->bre ? EATTWO('\\', a) : EAT(a))
202 1.1 jtc #define NEXT() (p->next++)
203 1.1 jtc #define NEXT2() (p->next += 2)
204 1.1 jtc #define NEXTn(n) (p->next += (n))
205 1.1 jtc #define GETNEXT() (*p->next++)
206 1.39 christos #define WGETNEXT() wgetnext(p)
207 1.1 jtc #define SETERROR(e) seterr(p, (e))
208 1.39 christos #define REQUIRE(co, e) ((co) || SETERROR(e))
209 1.1 jtc #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
210 1.39 christos #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
211 1.1 jtc #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
212 1.40 christos #define EMIT(op, sopnd) doemit(p, (op), (sopnd))
213 1.40 christos #define INSERT(op, pos) doinsert(p, (op), HERE()-(pos)+1, pos)
214 1.12 drochner #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
215 1.1 jtc #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
216 1.1 jtc #define HERE() (p->slen)
217 1.1 jtc #define THERE() (p->slen - 1)
218 1.4 jtc #define THERETHERE() (p->slen - 2)
219 1.1 jtc #define DROP(n) (p->slen -= (n))
220 1.1 jtc
221 1.39 christos /* Macro used by computejump()/computematchjump() */
222 1.41 christos #ifndef MIN
223 1.39 christos #define MIN(a,b) ((a)<(b)?(a):(b))
224 1.41 christos #endif
225 1.30 christos
226 1.42 christos #ifndef NLS
227 1.42 christos static const struct {
228 1.42 christos const char *name;
229 1.42 christos int (*func)(int);
230 1.42 christos } wctypes[] = {
231 1.42 christos #define ADD(x) { .name = # x, .func = is ## x }
232 1.42 christos ADD(alnum),
233 1.42 christos ADD(alpha),
234 1.42 christos ADD(blank),
235 1.42 christos ADD(cntrl),
236 1.42 christos ADD(digit),
237 1.42 christos ADD(graph),
238 1.42 christos ADD(lower),
239 1.42 christos ADD(print),
240 1.42 christos ADD(punct),
241 1.42 christos ADD(space),
242 1.42 christos ADD(upper),
243 1.42 christos ADD(xdigit),
244 1.42 christos #undef ADD
245 1.42 christos };
246 1.42 christos
247 1.42 christos wctype_t
248 1.42 christos __regex_wctype(const char *str)
249 1.42 christos {
250 1.42 christos for (size_t i = 0; i < __arraycount(wctypes); i++) {
251 1.42 christos if (strcmp(wctypes[i].name, str) == 0)
252 1.42 christos return (wctype_t)(i + 1);
253 1.42 christos }
254 1.42 christos return (wctype_t)0;
255 1.42 christos }
256 1.42 christos
257 1.42 christos int
258 1.42 christos __regex_iswctype(wint_t c, wctype_t ct)
259 1.42 christos {
260 1.42 christos if (ct == 0)
261 1.42 christos return 0;
262 1.42 christos return (*wctypes[ct - 1].func)(c);
263 1.42 christos }
264 1.42 christos #endif
265 1.42 christos
266 1.39 christos static int /* 0 success, otherwise REG_something */
267 1.39 christos regcomp_internal(regex_t * __restrict preg,
268 1.39 christos const char * __restrict pattern,
269 1.39 christos int cflags, int pflags)
270 1.1 jtc {
271 1.1 jtc struct parse pa;
272 1.9 perry struct re_guts *g;
273 1.9 perry struct parse *p = &pa;
274 1.9 perry int i;
275 1.9 perry size_t len;
276 1.39 christos size_t maxlen;
277 1.3 jtc #ifdef REDEBUG
278 1.3 jtc # define GOODFLAGS(f) (f)
279 1.3 jtc #else
280 1.3 jtc # define GOODFLAGS(f) ((f)&~REG_DUMP)
281 1.3 jtc #endif
282 1.1 jtc
283 1.14 lukem _DIAGASSERT(preg != NULL);
284 1.14 lukem _DIAGASSERT(pattern != NULL);
285 1.14 lukem
286 1.3 jtc cflags = GOODFLAGS(cflags);
287 1.1 jtc if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
288 1.1 jtc return(REG_INVARG);
289 1.1 jtc
290 1.1 jtc if (cflags®_PEND) {
291 1.1 jtc if (preg->re_endp < pattern)
292 1.1 jtc return(REG_INVARG);
293 1.1 jtc len = preg->re_endp - pattern;
294 1.1 jtc } else
295 1.11 christos len = strlen(pattern);
296 1.1 jtc
297 1.1 jtc /* do the mallocs early so failure handling is easy */
298 1.39 christos g = malloc(sizeof(*g));
299 1.1 jtc if (g == NULL)
300 1.1 jtc return(REG_ESPACE);
301 1.39 christos /*
302 1.39 christos * Limit the pattern space to avoid a 32-bit overflow on buffer
303 1.39 christos * extension. Also avoid any signed overflow in case of conversion
304 1.39 christos * so make the real limit based on a 31-bit overflow.
305 1.39 christos *
306 1.39 christos * Likely not applicable on 64-bit systems but handle the case
307 1.39 christos * generically (who are we to stop people from using ~715MB+
308 1.39 christos * patterns?).
309 1.39 christos */
310 1.40 christos maxlen = ((size_t)-1 >> 1) / sizeof(*p->strip) * 2 / 3;
311 1.39 christos if (len >= maxlen) {
312 1.40 christos free(g);
313 1.39 christos return(REG_ESPACE);
314 1.39 christos }
315 1.40 christos p->ssize = (sopno)(len / 2 * 3 + 1); /* ugh */
316 1.39 christos assert(p->ssize >= len);
317 1.39 christos
318 1.39 christos p->strip = calloc(p->ssize, sizeof(*p->strip));
319 1.1 jtc p->slen = 0;
320 1.1 jtc if (p->strip == NULL) {
321 1.40 christos free(g);
322 1.1 jtc return(REG_ESPACE);
323 1.1 jtc }
324 1.1 jtc
325 1.1 jtc /* set things up */
326 1.1 jtc p->g = g;
327 1.39 christos p->next = pattern; /* convenience; we do not modify it */
328 1.1 jtc p->end = p->next + len;
329 1.1 jtc p->error = 0;
330 1.1 jtc p->ncsalloc = 0;
331 1.39 christos p->pflags = pflags;
332 1.1 jtc for (i = 0; i < NPAREN; i++) {
333 1.1 jtc p->pbegin[i] = 0;
334 1.1 jtc p->pend[i] = 0;
335 1.1 jtc }
336 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
337 1.39 christos if ((cflags & REG_GNU) == 0) {
338 1.39 christos p->gnuext = false;
339 1.39 christos p->allowbranch = (cflags & REG_EXTENDED) != 0;
340 1.39 christos } else
341 1.39 christos p->gnuext = p->allowbranch = true;
342 1.39 christos #else
343 1.39 christos p->gnuext = false;
344 1.39 christos p->allowbranch = (cflags & REG_EXTENDED) != 0;
345 1.39 christos #endif
346 1.39 christos if (cflags & REG_EXTENDED) {
347 1.39 christos p->bre = false;
348 1.39 christos p->parse_expr = p_ere_exp;
349 1.39 christos p->pre_parse = NULL;
350 1.39 christos p->post_parse = NULL;
351 1.39 christos } else {
352 1.39 christos p->bre = true;
353 1.39 christos p->parse_expr = p_simp_re;
354 1.39 christos p->pre_parse = p_bre_pre_parse;
355 1.39 christos p->post_parse = p_bre_post_parse;
356 1.39 christos }
357 1.1 jtc g->sets = NULL;
358 1.1 jtc g->ncsets = 0;
359 1.1 jtc g->cflags = cflags;
360 1.1 jtc g->iflags = 0;
361 1.1 jtc g->nbol = 0;
362 1.1 jtc g->neol = 0;
363 1.1 jtc g->must = NULL;
364 1.39 christos g->moffset = -1;
365 1.39 christos g->charjump = NULL;
366 1.39 christos g->matchjump = NULL;
367 1.1 jtc g->mlen = 0;
368 1.1 jtc g->nsub = 0;
369 1.1 jtc g->backrefs = 0;
370 1.1 jtc
371 1.1 jtc /* do it */
372 1.1 jtc EMIT(OEND, 0);
373 1.1 jtc g->firststate = THERE();
374 1.39 christos if (cflags & REG_NOSPEC)
375 1.1 jtc p_str(p);
376 1.1 jtc else
377 1.39 christos p_re(p, OUT, OUT);
378 1.1 jtc EMIT(OEND, 0);
379 1.1 jtc g->laststate = THERE();
380 1.1 jtc
381 1.1 jtc /* tidy up loose ends and fill things in */
382 1.1 jtc stripsnug(p, g);
383 1.1 jtc findmust(p, g);
384 1.39 christos /* only use Boyer-Moore algorithm if the pattern is bigger
385 1.39 christos * than three characters
386 1.39 christos */
387 1.39 christos if(g->mlen > 3) {
388 1.39 christos computejumps(p, g);
389 1.39 christos computematchjumps(p, g);
390 1.39 christos if(g->matchjump == NULL && g->charjump != NULL) {
391 1.39 christos free(g->charjump);
392 1.39 christos g->charjump = NULL;
393 1.39 christos }
394 1.39 christos }
395 1.1 jtc g->nplus = pluscount(p, g);
396 1.1 jtc g->magic = MAGIC2;
397 1.1 jtc preg->re_nsub = g->nsub;
398 1.1 jtc preg->re_g = g;
399 1.1 jtc preg->re_magic = MAGIC1;
400 1.1 jtc #ifndef REDEBUG
401 1.1 jtc /* not debugging, so can't rely on the assert() in regexec() */
402 1.1 jtc if (g->iflags&BAD)
403 1.1 jtc SETERROR(REG_ASSERT);
404 1.1 jtc #endif
405 1.1 jtc
406 1.1 jtc /* win or lose, we're done */
407 1.1 jtc if (p->error != 0) /* lose */
408 1.1 jtc regfree(preg);
409 1.1 jtc return(p->error);
410 1.1 jtc }
411 1.1 jtc
412 1.1 jtc /*
413 1.39 christos - regcomp - interface for parser and compilation
414 1.39 christos = extern int regcomp(regex_t *, const char *, int);
415 1.39 christos = #define REG_BASIC 0000
416 1.39 christos = #define REG_EXTENDED 0001
417 1.39 christos = #define REG_ICASE 0002
418 1.39 christos = #define REG_NOSUB 0004
419 1.39 christos = #define REG_NEWLINE 0010
420 1.39 christos = #define REG_NOSPEC 0020
421 1.39 christos = #define REG_PEND 0040
422 1.39 christos = #define REG_DUMP 0200
423 1.1 jtc */
424 1.39 christos int /* 0 success, otherwise REG_something */
425 1.39 christos regcomp(regex_t * __restrict preg,
426 1.39 christos const char * __restrict pattern,
427 1.39 christos int cflags)
428 1.1 jtc {
429 1.30 christos
430 1.39 christos return (regcomp_internal(preg, pattern, cflags, 0));
431 1.1 jtc }
432 1.1 jtc
433 1.1 jtc /*
434 1.39 christos - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
435 1.39 christos - return whether we should terminate or not
436 1.39 christos == static bool p_ere_exp(struct parse *p);
437 1.1 jtc */
438 1.39 christos static bool
439 1.39 christos p_ere_exp(struct parse *p, struct branchc *bc)
440 1.1 jtc {
441 1.9 perry char c;
442 1.39 christos wint_t wc;
443 1.9 perry sopno pos;
444 1.9 perry int count;
445 1.9 perry int count2;
446 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
447 1.40 christos size_t i;
448 1.39 christos int handled;
449 1.39 christos #endif
450 1.9 perry sopno subno;
451 1.1 jtc int wascaret = 0;
452 1.1 jtc
453 1.14 lukem _DIAGASSERT(p != NULL);
454 1.14 lukem
455 1.39 christos (void)bc;
456 1.1 jtc assert(MORE()); /* caller should have ensured this */
457 1.1 jtc c = GETNEXT();
458 1.1 jtc
459 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
460 1.39 christos handled = 0;
461 1.39 christos #endif
462 1.1 jtc pos = HERE();
463 1.1 jtc switch (c) {
464 1.1 jtc case '(':
465 1.39 christos (void)REQUIRE(MORE(), REG_EPAREN);
466 1.1 jtc p->g->nsub++;
467 1.40 christos subno = (sopno)p->g->nsub;
468 1.1 jtc if (subno < NPAREN)
469 1.1 jtc p->pbegin[subno] = HERE();
470 1.1 jtc EMIT(OLPAREN, subno);
471 1.1 jtc if (!SEE(')'))
472 1.39 christos p_re(p, ')', IGN);
473 1.1 jtc if (subno < NPAREN) {
474 1.1 jtc p->pend[subno] = HERE();
475 1.1 jtc assert(p->pend[subno] != 0);
476 1.1 jtc }
477 1.1 jtc EMIT(ORPAREN, subno);
478 1.39 christos (void)MUSTEAT(')', REG_EPAREN);
479 1.1 jtc break;
480 1.1 jtc #ifndef POSIX_MISTAKE
481 1.1 jtc case ')': /* happens only if no current unmatched ( */
482 1.1 jtc /*
483 1.1 jtc * You may ask, why the ifndef? Because I didn't notice
484 1.1 jtc * this until slightly too late for 1003.2, and none of the
485 1.1 jtc * other 1003.2 regular-expression reviewers noticed it at
486 1.1 jtc * all. So an unmatched ) is legal POSIX, at least until
487 1.1 jtc * we can get it fixed.
488 1.1 jtc */
489 1.1 jtc SETERROR(REG_EPAREN);
490 1.1 jtc break;
491 1.1 jtc #endif
492 1.1 jtc case '^':
493 1.1 jtc EMIT(OBOL, 0);
494 1.1 jtc p->g->iflags |= USEBOL;
495 1.1 jtc p->g->nbol++;
496 1.1 jtc wascaret = 1;
497 1.1 jtc break;
498 1.1 jtc case '$':
499 1.1 jtc EMIT(OEOL, 0);
500 1.1 jtc p->g->iflags |= USEEOL;
501 1.1 jtc p->g->neol++;
502 1.1 jtc break;
503 1.1 jtc case '|':
504 1.1 jtc SETERROR(REG_EMPTY);
505 1.1 jtc break;
506 1.1 jtc case '*':
507 1.1 jtc case '+':
508 1.1 jtc case '?':
509 1.39 christos case '{':
510 1.1 jtc SETERROR(REG_BADRPT);
511 1.1 jtc break;
512 1.1 jtc case '.':
513 1.1 jtc if (p->g->cflags®_NEWLINE)
514 1.1 jtc nonnewline(p);
515 1.1 jtc else
516 1.1 jtc EMIT(OANY, 0);
517 1.1 jtc break;
518 1.1 jtc case '[':
519 1.1 jtc p_bracket(p);
520 1.1 jtc break;
521 1.1 jtc case '\\':
522 1.39 christos (void)REQUIRE(MORE(), REG_EESCAPE);
523 1.39 christos wc = WGETNEXT();
524 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
525 1.39 christos if (p->gnuext) {
526 1.39 christos handled = 1;
527 1.39 christos switch (wc) {
528 1.39 christos case '`':
529 1.39 christos EMIT(OBOS, 0);
530 1.39 christos break;
531 1.39 christos case '\'':
532 1.39 christos EMIT(OEOS, 0);
533 1.39 christos break;
534 1.39 christos case 'B':
535 1.39 christos EMIT(ONWBND, 0);
536 1.39 christos break;
537 1.39 christos case 'b':
538 1.39 christos EMIT(OWBND, 0);
539 1.39 christos break;
540 1.39 christos case 'W':
541 1.39 christos case 'w':
542 1.39 christos case 'S':
543 1.39 christos case 's':
544 1.39 christos p_b_pseudoclass(p, wc);
545 1.39 christos break;
546 1.39 christos case '1':
547 1.39 christos case '2':
548 1.39 christos case '3':
549 1.39 christos case '4':
550 1.39 christos case '5':
551 1.39 christos case '6':
552 1.39 christos case '7':
553 1.39 christos case '8':
554 1.39 christos case '9':
555 1.39 christos i = wc - '0';
556 1.39 christos assert(i < NPAREN);
557 1.39 christos if (p->pend[i] != 0) {
558 1.39 christos assert(i <= p->g->nsub);
559 1.39 christos EMIT(OBACK_, i);
560 1.39 christos assert(p->pbegin[i] != 0);
561 1.39 christos assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
562 1.39 christos assert(OP(p->strip[p->pend[i]]) == ORPAREN);
563 1.39 christos (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
564 1.39 christos EMIT(O_BACK, i);
565 1.39 christos } else
566 1.39 christos SETERROR(REG_ESUBREG);
567 1.39 christos p->g->backrefs = 1;
568 1.39 christos break;
569 1.39 christos default:
570 1.39 christos handled = 0;
571 1.39 christos }
572 1.39 christos /* Don't proceed to the POSIX bits if we've already handled it */
573 1.39 christos if (handled)
574 1.39 christos break;
575 1.39 christos }
576 1.39 christos #endif
577 1.39 christos switch (wc) {
578 1.39 christos case '<':
579 1.39 christos EMIT(OBOW, 0);
580 1.39 christos break;
581 1.39 christos case '>':
582 1.39 christos EMIT(OEOW, 0);
583 1.39 christos break;
584 1.39 christos default:
585 1.39 christos if (may_escape(p, wc))
586 1.39 christos ordinary(p, wc);
587 1.39 christos else
588 1.39 christos SETERROR(REG_EESCAPE);
589 1.39 christos break;
590 1.39 christos }
591 1.1 jtc break;
592 1.1 jtc default:
593 1.38 christos if (p->error != 0)
594 1.39 christos return (false);
595 1.39 christos p->next--;
596 1.39 christos wc = WGETNEXT();
597 1.39 christos ordinary(p, wc);
598 1.1 jtc break;
599 1.1 jtc }
600 1.1 jtc
601 1.1 jtc if (!MORE())
602 1.39 christos return (false);
603 1.1 jtc c = PEEK();
604 1.1 jtc /* we call { a repetition if followed by a digit */
605 1.39 christos if (!( c == '*' || c == '+' || c == '?' || c == '{'))
606 1.39 christos return (false); /* no repetition, we're done */
607 1.39 christos else if (c == '{')
608 1.39 christos (void)REQUIRE(MORE2() && \
609 1.39 christos (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
610 1.1 jtc NEXT();
611 1.1 jtc
612 1.39 christos (void)REQUIRE(!wascaret, REG_BADRPT);
613 1.1 jtc switch (c) {
614 1.1 jtc case '*': /* implemented as +? */
615 1.4 jtc /* this case does not require the (y|) trick, noKLUDGE */
616 1.1 jtc INSERT(OPLUS_, pos);
617 1.1 jtc ASTERN(O_PLUS, pos);
618 1.1 jtc INSERT(OQUEST_, pos);
619 1.1 jtc ASTERN(O_QUEST, pos);
620 1.1 jtc break;
621 1.1 jtc case '+':
622 1.1 jtc INSERT(OPLUS_, pos);
623 1.1 jtc ASTERN(O_PLUS, pos);
624 1.1 jtc break;
625 1.1 jtc case '?':
626 1.4 jtc /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
627 1.4 jtc INSERT(OCH_, pos); /* offset slightly wrong */
628 1.4 jtc ASTERN(OOR1, pos); /* this one's right */
629 1.4 jtc AHEAD(pos); /* fix the OCH_ */
630 1.4 jtc EMIT(OOR2, 0); /* offset very wrong... */
631 1.4 jtc AHEAD(THERE()); /* ...so fix it */
632 1.4 jtc ASTERN(O_CH, THERETHERE());
633 1.1 jtc break;
634 1.1 jtc case '{':
635 1.1 jtc count = p_count(p);
636 1.1 jtc if (EAT(',')) {
637 1.39 christos if (isdigit((uch)PEEK())) {
638 1.1 jtc count2 = p_count(p);
639 1.39 christos (void)REQUIRE(count <= count2, REG_BADBR);
640 1.1 jtc } else /* single number with comma */
641 1.1 jtc count2 = INFINITY;
642 1.1 jtc } else /* just a single number */
643 1.1 jtc count2 = count;
644 1.39 christos repeat(p, pos, count, count2);
645 1.1 jtc if (!EAT('}')) { /* error heuristics */
646 1.1 jtc while (MORE() && PEEK() != '}')
647 1.1 jtc NEXT();
648 1.39 christos (void)REQUIRE(MORE(), REG_EBRACE);
649 1.1 jtc SETERROR(REG_BADBR);
650 1.1 jtc }
651 1.1 jtc break;
652 1.1 jtc }
653 1.1 jtc
654 1.1 jtc if (!MORE())
655 1.39 christos return (false);
656 1.1 jtc c = PEEK();
657 1.1 jtc if (!( c == '*' || c == '+' || c == '?' ||
658 1.39 christos (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
659 1.39 christos return (false);
660 1.1 jtc SETERROR(REG_BADRPT);
661 1.39 christos return (false);
662 1.1 jtc }
663 1.1 jtc
664 1.1 jtc /*
665 1.1 jtc - p_str - string (no metacharacters) "parser"
666 1.9 perry == static void p_str(struct parse *p);
667 1.1 jtc */
668 1.1 jtc static void
669 1.39 christos p_str(struct parse *p)
670 1.1 jtc {
671 1.39 christos (void)REQUIRE(MORE(), REG_EMPTY);
672 1.39 christos while (MORE())
673 1.39 christos ordinary(p, WGETNEXT());
674 1.39 christos }
675 1.14 lukem
676 1.39 christos /*
677 1.39 christos * Eat consecutive branch delimiters for the kind of expression that we are
678 1.39 christos * parsing, return the number of delimiters that we ate.
679 1.39 christos */
680 1.39 christos static int
681 1.39 christos p_branch_eat_delim(struct parse *p, struct branchc *bc)
682 1.39 christos {
683 1.39 christos int nskip;
684 1.14 lukem
685 1.39 christos (void)bc;
686 1.39 christos nskip = 0;
687 1.39 christos while (EATSPEC('|'))
688 1.39 christos ++nskip;
689 1.39 christos return (nskip);
690 1.1 jtc }
691 1.1 jtc
692 1.1 jtc /*
693 1.39 christos * Insert necessary branch book-keeping operations. This emits a
694 1.39 christos * bogus 'next' offset, since we still have more to parse
695 1.1 jtc */
696 1.1 jtc static void
697 1.39 christos p_branch_ins_offset(struct parse *p, struct branchc *bc)
698 1.9 perry {
699 1.1 jtc
700 1.39 christos if (bc->nbranch == 0) {
701 1.39 christos INSERT(OCH_, bc->start); /* offset is wrong */
702 1.39 christos bc->fwd = bc->start;
703 1.39 christos bc->back = bc->start;
704 1.39 christos }
705 1.39 christos
706 1.39 christos ASTERN(OOR1, bc->back);
707 1.39 christos bc->back = THERE();
708 1.39 christos AHEAD(bc->fwd); /* fix previous offset */
709 1.39 christos bc->fwd = HERE();
710 1.39 christos EMIT(OOR2, 0); /* offset is very wrong */
711 1.39 christos ++bc->nbranch;
712 1.39 christos }
713 1.39 christos
714 1.39 christos /*
715 1.39 christos * Fix the offset of the tail branch, if we actually had any branches.
716 1.39 christos * This is to correct the bogus placeholder offset that we use.
717 1.39 christos */
718 1.39 christos static void
719 1.39 christos p_branch_fix_tail(struct parse *p, struct branchc *bc)
720 1.39 christos {
721 1.14 lukem
722 1.39 christos /* Fix bogus offset at the tail if we actually have branches */
723 1.39 christos if (bc->nbranch > 0) {
724 1.39 christos AHEAD(bc->fwd);
725 1.39 christos ASTERN(O_CH, bc->back);
726 1.30 christos }
727 1.39 christos }
728 1.39 christos
729 1.39 christos /*
730 1.39 christos * Signal to the parser that an empty branch has been encountered; this will,
731 1.39 christos * in the future, be used to allow for more permissive behavior with empty
732 1.39 christos * branches. The return value should indicate whether parsing may continue
733 1.39 christos * or not.
734 1.39 christos */
735 1.39 christos static bool
736 1.39 christos p_branch_empty(struct parse *p, struct branchc *bc)
737 1.39 christos {
738 1.39 christos
739 1.39 christos (void)bc;
740 1.39 christos SETERROR(REG_EMPTY);
741 1.39 christos return (false);
742 1.39 christos }
743 1.39 christos
744 1.39 christos /*
745 1.39 christos * Take care of any branching requirements. This includes inserting the
746 1.39 christos * appropriate branching instructions as well as eating all of the branch
747 1.39 christos * delimiters until we either run out of pattern or need to parse more pattern.
748 1.39 christos */
749 1.39 christos static bool
750 1.39 christos p_branch_do(struct parse *p, struct branchc *bc)
751 1.39 christos {
752 1.39 christos int ate = 0;
753 1.39 christos
754 1.39 christos ate = p_branch_eat_delim(p, bc);
755 1.39 christos if (ate == 0)
756 1.39 christos return (false);
757 1.39 christos else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
758 1.39 christos /*
759 1.39 christos * Halt parsing only if we have an empty branch and p_branch_empty
760 1.39 christos * indicates that we must not continue. In the future, this will not
761 1.39 christos * necessarily be an error.
762 1.39 christos */
763 1.39 christos return (false);
764 1.39 christos p_branch_ins_offset(p, bc);
765 1.39 christos
766 1.39 christos return (true);
767 1.39 christos }
768 1.30 christos
769 1.39 christos static void
770 1.39 christos p_bre_pre_parse(struct parse *p, struct branchc *bc)
771 1.39 christos {
772 1.14 lukem
773 1.40 christos (void)bc;
774 1.39 christos /*
775 1.39 christos * Does not move cleanly into expression parser because of
776 1.39 christos * ordinary interpration of * at the beginning position of
777 1.39 christos * an expression.
778 1.39 christos */
779 1.1 jtc if (EAT('^')) {
780 1.1 jtc EMIT(OBOL, 0);
781 1.1 jtc p->g->iflags |= USEBOL;
782 1.1 jtc p->g->nbol++;
783 1.1 jtc }
784 1.39 christos }
785 1.39 christos
786 1.39 christos static void
787 1.39 christos p_bre_post_parse(struct parse *p, struct branchc *bc)
788 1.39 christos {
789 1.39 christos
790 1.39 christos /* Expression is terminating due to EOL token */
791 1.39 christos if (bc->terminate) {
792 1.1 jtc DROP(1);
793 1.1 jtc EMIT(OEOL, 0);
794 1.1 jtc p->g->iflags |= USEEOL;
795 1.1 jtc p->g->neol++;
796 1.1 jtc }
797 1.39 christos }
798 1.1 jtc
799 1.39 christos /*
800 1.39 christos - p_re - Top level parser, concatenation and BRE anchoring
801 1.39 christos == static void p_re(struct parse *p, int end1, int end2);
802 1.39 christos * Giving end1 as OUT essentially eliminates the end1/end2 check.
803 1.39 christos *
804 1.39 christos * This implementation is a bit of a kludge, in that a trailing $ is first
805 1.39 christos * taken as an ordinary character and then revised to be an anchor.
806 1.39 christos * The amount of lookahead needed to avoid this kludge is excessive.
807 1.39 christos */
808 1.39 christos static void
809 1.39 christos p_re(struct parse *p,
810 1.39 christos int end1, /* first terminating character */
811 1.39 christos int end2) /* second terminating character; ignored for EREs */
812 1.39 christos {
813 1.39 christos struct branchc bc;
814 1.39 christos
815 1.39 christos bc.nbranch = 0;
816 1.39 christos if (end1 == OUT && end2 == OUT)
817 1.39 christos bc.outer = true;
818 1.39 christos else
819 1.39 christos bc.outer = false;
820 1.39 christos #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2))
821 1.39 christos for (;;) {
822 1.39 christos bc.start = HERE();
823 1.39 christos bc.nchain = 0;
824 1.39 christos bc.terminate = false;
825 1.39 christos if (p->pre_parse != NULL)
826 1.39 christos p->pre_parse(p, &bc);
827 1.39 christos while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
828 1.39 christos bc.terminate = p->parse_expr(p, &bc);
829 1.39 christos ++bc.nchain;
830 1.39 christos }
831 1.39 christos if (p->post_parse != NULL)
832 1.39 christos p->post_parse(p, &bc);
833 1.39 christos (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY);
834 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
835 1.39 christos if (p->gnuext && HERE() == bc.start && !p_branch_empty(p, &bc))
836 1.39 christos break;
837 1.39 christos #endif
838 1.39 christos if (!p->allowbranch)
839 1.39 christos break;
840 1.39 christos /*
841 1.39 christos * p_branch_do's return value indicates whether we should
842 1.39 christos * continue parsing or not. This is both for correctness and
843 1.39 christos * a slight optimization, because it will check if we've
844 1.39 christos * encountered an empty branch or the end of the string
845 1.39 christos * immediately following a branch delimiter.
846 1.39 christos */
847 1.39 christos if (!p_branch_do(p, &bc))
848 1.39 christos break;
849 1.39 christos }
850 1.39 christos #undef SEE_END
851 1.39 christos if (p->allowbranch)
852 1.39 christos p_branch_fix_tail(p, &bc);
853 1.39 christos assert(!MORE() || SEE(end1));
854 1.1 jtc }
855 1.1 jtc
856 1.1 jtc /*
857 1.1 jtc - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
858 1.39 christos == static bool p_simp_re(struct parse *p, struct branchc *bc);
859 1.1 jtc */
860 1.39 christos static bool /* was the simple RE an unbackslashed $? */
861 1.39 christos p_simp_re(struct parse *p, struct branchc *bc)
862 1.1 jtc {
863 1.9 perry int c;
864 1.39 christos int cc; /* convenient/control character */
865 1.9 perry int count;
866 1.9 perry int count2;
867 1.39 christos sopno pos;
868 1.39 christos bool handled;
869 1.40 christos size_t i;
870 1.39 christos wint_t wc;
871 1.9 perry sopno subno;
872 1.1 jtc # define BACKSL (1<<CHAR_BIT)
873 1.1 jtc
874 1.39 christos pos = HERE(); /* repetition op, if any, covers from here */
875 1.39 christos handled = false;
876 1.1 jtc
877 1.1 jtc assert(MORE()); /* caller should have ensured this */
878 1.1 jtc c = GETNEXT();
879 1.1 jtc if (c == '\\') {
880 1.39 christos (void)REQUIRE(MORE(), REG_EESCAPE);
881 1.39 christos cc = GETNEXT();
882 1.39 christos c = BACKSL | cc;
883 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
884 1.39 christos if (p->gnuext) {
885 1.39 christos handled = true;
886 1.39 christos switch (c) {
887 1.39 christos case BACKSL|'`':
888 1.39 christos EMIT(OBOS, 0);
889 1.39 christos break;
890 1.39 christos case BACKSL|'\'':
891 1.39 christos EMIT(OEOS, 0);
892 1.39 christos break;
893 1.39 christos case BACKSL|'B':
894 1.39 christos EMIT(ONWBND, 0);
895 1.39 christos break;
896 1.39 christos case BACKSL|'b':
897 1.39 christos EMIT(OWBND, 0);
898 1.39 christos break;
899 1.39 christos case BACKSL|'W':
900 1.39 christos case BACKSL|'w':
901 1.39 christos case BACKSL|'S':
902 1.39 christos case BACKSL|'s':
903 1.39 christos p_b_pseudoclass(p, cc);
904 1.39 christos break;
905 1.39 christos default:
906 1.39 christos handled = false;
907 1.39 christos }
908 1.39 christos }
909 1.39 christos #endif
910 1.1 jtc }
911 1.39 christos if (!handled) {
912 1.39 christos switch (c) {
913 1.39 christos case '.':
914 1.39 christos if (p->g->cflags®_NEWLINE)
915 1.39 christos nonnewline(p);
916 1.39 christos else
917 1.39 christos EMIT(OANY, 0);
918 1.39 christos break;
919 1.39 christos case '[':
920 1.39 christos p_bracket(p);
921 1.39 christos break;
922 1.39 christos case BACKSL|'<':
923 1.39 christos EMIT(OBOW, 0);
924 1.39 christos break;
925 1.39 christos case BACKSL|'>':
926 1.39 christos EMIT(OEOW, 0);
927 1.39 christos break;
928 1.39 christos case BACKSL|'{':
929 1.39 christos SETERROR(REG_BADRPT);
930 1.39 christos break;
931 1.39 christos case BACKSL|'(':
932 1.39 christos p->g->nsub++;
933 1.40 christos subno = (sopno)p->g->nsub;
934 1.39 christos if (subno < NPAREN)
935 1.39 christos p->pbegin[subno] = HERE();
936 1.39 christos EMIT(OLPAREN, subno);
937 1.39 christos /* the MORE here is an error heuristic */
938 1.39 christos if (MORE() && !SEETWO('\\', ')'))
939 1.39 christos p_re(p, '\\', ')');
940 1.39 christos if (subno < NPAREN) {
941 1.39 christos p->pend[subno] = HERE();
942 1.39 christos assert(p->pend[subno] != 0);
943 1.39 christos }
944 1.39 christos EMIT(ORPAREN, subno);
945 1.39 christos (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
946 1.39 christos break;
947 1.39 christos case BACKSL|')': /* should not get here -- must be user */
948 1.39 christos SETERROR(REG_EPAREN);
949 1.39 christos break;
950 1.39 christos case BACKSL|'1':
951 1.39 christos case BACKSL|'2':
952 1.39 christos case BACKSL|'3':
953 1.39 christos case BACKSL|'4':
954 1.39 christos case BACKSL|'5':
955 1.39 christos case BACKSL|'6':
956 1.39 christos case BACKSL|'7':
957 1.39 christos case BACKSL|'8':
958 1.39 christos case BACKSL|'9':
959 1.39 christos i = (c&~BACKSL) - '0';
960 1.39 christos assert(i < NPAREN);
961 1.39 christos if (p->pend[i] != 0) {
962 1.39 christos assert(i <= p->g->nsub);
963 1.39 christos EMIT(OBACK_, i);
964 1.39 christos assert(p->pbegin[i] != 0);
965 1.39 christos assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
966 1.39 christos assert(OP(p->strip[p->pend[i]]) == ORPAREN);
967 1.39 christos (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
968 1.39 christos EMIT(O_BACK, i);
969 1.39 christos } else
970 1.39 christos SETERROR(REG_ESUBREG);
971 1.39 christos p->g->backrefs = 1;
972 1.39 christos break;
973 1.39 christos case '*':
974 1.39 christos /*
975 1.39 christos * Ordinary if used as the first character beyond BOL anchor of
976 1.39 christos * a (sub-)expression, counts as a bad repetition operator if it
977 1.39 christos * appears otherwise.
978 1.39 christos */
979 1.39 christos (void)REQUIRE(bc->nchain == 0, REG_BADRPT);
980 1.39 christos /* FALLTHROUGH */
981 1.39 christos default:
982 1.39 christos if (p->error != 0)
983 1.39 christos return (false); /* Definitely not $... */
984 1.39 christos p->next--;
985 1.39 christos wc = WGETNEXT();
986 1.39 christos if ((c & BACKSL) == 0 || may_escape(p, wc))
987 1.39 christos ordinary(p, wc);
988 1.39 christos else
989 1.39 christos SETERROR(REG_EESCAPE);
990 1.39 christos break;
991 1.1 jtc }
992 1.1 jtc }
993 1.1 jtc
994 1.1 jtc if (EAT('*')) { /* implemented as +? */
995 1.4 jtc /* this case does not require the (y|) trick, noKLUDGE */
996 1.1 jtc INSERT(OPLUS_, pos);
997 1.1 jtc ASTERN(O_PLUS, pos);
998 1.1 jtc INSERT(OQUEST_, pos);
999 1.1 jtc ASTERN(O_QUEST, pos);
1000 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
1001 1.39 christos } else if (p->gnuext && EATTWO('\\', '?')) {
1002 1.39 christos INSERT(OQUEST_, pos);
1003 1.39 christos ASTERN(O_QUEST, pos);
1004 1.39 christos } else if (p->gnuext && EATTWO('\\', '+')) {
1005 1.39 christos INSERT(OPLUS_, pos);
1006 1.39 christos ASTERN(O_PLUS, pos);
1007 1.39 christos #endif
1008 1.1 jtc } else if (EATTWO('\\', '{')) {
1009 1.1 jtc count = p_count(p);
1010 1.1 jtc if (EAT(',')) {
1011 1.39 christos if (MORE() && isdigit((uch)PEEK())) {
1012 1.1 jtc count2 = p_count(p);
1013 1.39 christos (void)REQUIRE(count <= count2, REG_BADBR);
1014 1.1 jtc } else /* single number with comma */
1015 1.1 jtc count2 = INFINITY;
1016 1.1 jtc } else /* just a single number */
1017 1.1 jtc count2 = count;
1018 1.39 christos repeat(p, pos, count, count2);
1019 1.1 jtc if (!EATTWO('\\', '}')) { /* error heuristics */
1020 1.1 jtc while (MORE() && !SEETWO('\\', '}'))
1021 1.1 jtc NEXT();
1022 1.39 christos (void)REQUIRE(MORE(), REG_EBRACE);
1023 1.1 jtc SETERROR(REG_BADBR);
1024 1.1 jtc }
1025 1.39 christos } else if (c == '$') /* $ (but not \$) ends it */
1026 1.39 christos return (true);
1027 1.1 jtc
1028 1.39 christos return (false);
1029 1.1 jtc }
1030 1.1 jtc
1031 1.1 jtc /*
1032 1.1 jtc - p_count - parse a repetition count
1033 1.9 perry == static int p_count(struct parse *p);
1034 1.1 jtc */
1035 1.1 jtc static int /* the value */
1036 1.39 christos p_count(struct parse *p)
1037 1.1 jtc {
1038 1.9 perry int count = 0;
1039 1.9 perry int ndigits = 0;
1040 1.1 jtc
1041 1.39 christos while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
1042 1.1 jtc count = count*10 + (GETNEXT() - '0');
1043 1.1 jtc ndigits++;
1044 1.1 jtc }
1045 1.1 jtc
1046 1.39 christos (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
1047 1.1 jtc return(count);
1048 1.1 jtc }
1049 1.1 jtc
1050 1.1 jtc /*
1051 1.1 jtc - p_bracket - parse a bracketed character list
1052 1.9 perry == static void p_bracket(struct parse *p);
1053 1.1 jtc */
1054 1.1 jtc static void
1055 1.39 christos p_bracket(struct parse *p)
1056 1.1 jtc {
1057 1.14 lukem cset *cs;
1058 1.39 christos wint_t ch;
1059 1.14 lukem
1060 1.1 jtc /* Dept of Truly Sickening Special-Case Kludges */
1061 1.39 christos if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
1062 1.1 jtc EMIT(OBOW, 0);
1063 1.1 jtc NEXTn(6);
1064 1.1 jtc return;
1065 1.1 jtc }
1066 1.39 christos if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
1067 1.1 jtc EMIT(OEOW, 0);
1068 1.1 jtc NEXTn(6);
1069 1.1 jtc return;
1070 1.1 jtc }
1071 1.1 jtc
1072 1.39 christos if ((cs = allocset(p)) == NULL)
1073 1.39 christos return;
1074 1.39 christos
1075 1.39 christos if (p->g->cflags®_ICASE)
1076 1.39 christos cs->icase = 1;
1077 1.1 jtc if (EAT('^'))
1078 1.39 christos cs->invert = 1;
1079 1.1 jtc if (EAT(']'))
1080 1.39 christos CHadd(p, cs, ']');
1081 1.1 jtc else if (EAT('-'))
1082 1.39 christos CHadd(p, cs, '-');
1083 1.1 jtc while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
1084 1.1 jtc p_b_term(p, cs);
1085 1.1 jtc if (EAT('-'))
1086 1.39 christos CHadd(p, cs, '-');
1087 1.39 christos (void)MUSTEAT(']', REG_EBRACK);
1088 1.1 jtc
1089 1.1 jtc if (p->error != 0) /* don't mess things up further */
1090 1.1 jtc return;
1091 1.1 jtc
1092 1.39 christos if (cs->invert && p->g->cflags®_NEWLINE)
1093 1.39 christos cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
1094 1.1 jtc
1095 1.39 christos if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
1096 1.39 christos ordinary(p, ch);
1097 1.39 christos freeset(p, cs);
1098 1.39 christos } else
1099 1.40 christos EMIT(OANYOF, (size_t)(cs - p->g->sets));
1100 1.39 christos }
1101 1.1 jtc
1102 1.39 christos static int
1103 1.39 christos p_range_cmp(wchar_t c1, wchar_t c2)
1104 1.39 christos {
1105 1.39 christos #ifdef REGEX_LIBC_COLLATE
1106 1.39 christos return __wcollate_range_cmp(c1, c2);
1107 1.42 christos #elif defined(NLS)
1108 1.39 christos /* Copied from libc/collate __wcollate_range_cmp */
1109 1.39 christos wchar_t s1[2], s2[2];
1110 1.1 jtc
1111 1.39 christos s1[0] = c1;
1112 1.39 christos s1[1] = L'\0';
1113 1.39 christos s2[0] = c2;
1114 1.39 christos s2[1] = L'\0';
1115 1.42 christos return wcscoll(s1, s2);
1116 1.42 christos #else
1117 1.42 christos char s1[2], s2[2];
1118 1.42 christos
1119 1.42 christos s1[0] = (char)c1;
1120 1.42 christos s1[1] = '\0';
1121 1.42 christos s2[0] = (char)c2;
1122 1.42 christos s2[1] = '\0';
1123 1.42 christos return strcoll(s1, s2);
1124 1.39 christos #endif
1125 1.1 jtc }
1126 1.1 jtc
1127 1.1 jtc /*
1128 1.1 jtc - p_b_term - parse one term of a bracketed character list
1129 1.9 perry == static void p_b_term(struct parse *p, cset *cs);
1130 1.1 jtc */
1131 1.1 jtc static void
1132 1.39 christos p_b_term(struct parse *p, cset *cs)
1133 1.1 jtc {
1134 1.9 perry char c;
1135 1.39 christos wint_t start, finish;
1136 1.39 christos wint_t i;
1137 1.39 christos #ifdef REGEX_LIBC_COLLATE
1138 1.39 christos struct xlocale_collate *table =
1139 1.39 christos (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
1140 1.39 christos #endif
1141 1.1 jtc
1142 1.14 lukem _DIAGASSERT(p != NULL);
1143 1.14 lukem _DIAGASSERT(cs != NULL);
1144 1.14 lukem
1145 1.1 jtc /* classify what we've got */
1146 1.1 jtc switch ((MORE()) ? PEEK() : '\0') {
1147 1.1 jtc case '[':
1148 1.1 jtc c = (MORE2()) ? PEEK2() : '\0';
1149 1.1 jtc break;
1150 1.1 jtc case '-':
1151 1.1 jtc SETERROR(REG_ERANGE);
1152 1.1 jtc return; /* NOTE RETURN */
1153 1.1 jtc default:
1154 1.1 jtc c = '\0';
1155 1.1 jtc break;
1156 1.1 jtc }
1157 1.1 jtc
1158 1.1 jtc switch (c) {
1159 1.1 jtc case ':': /* character class */
1160 1.1 jtc NEXT2();
1161 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1162 1.1 jtc c = PEEK();
1163 1.39 christos (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
1164 1.1 jtc p_b_cclass(p, cs);
1165 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1166 1.39 christos (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
1167 1.1 jtc break;
1168 1.1 jtc case '=': /* equivalence class */
1169 1.1 jtc NEXT2();
1170 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1171 1.1 jtc c = PEEK();
1172 1.39 christos (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
1173 1.1 jtc p_b_eclass(p, cs);
1174 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1175 1.39 christos (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
1176 1.1 jtc break;
1177 1.1 jtc default: /* symbol, ordinary character, or range */
1178 1.1 jtc start = p_b_symbol(p);
1179 1.1 jtc if (SEE('-') && MORE2() && PEEK2() != ']') {
1180 1.1 jtc /* range */
1181 1.1 jtc NEXT();
1182 1.1 jtc if (EAT('-'))
1183 1.1 jtc finish = '-';
1184 1.1 jtc else
1185 1.1 jtc finish = p_b_symbol(p);
1186 1.1 jtc } else
1187 1.1 jtc finish = start;
1188 1.39 christos if (start == finish)
1189 1.39 christos CHadd(p, cs, start);
1190 1.39 christos else {
1191 1.39 christos #ifdef REGEX_LIBC_COLLATE
1192 1.39 christos if (table->__collate_load_error || MB_CUR_MAX > 1) {
1193 1.39 christos #else
1194 1.39 christos if (MB_CUR_MAX > 1) {
1195 1.39 christos #endif
1196 1.39 christos (void)REQUIRE(start <= finish, REG_ERANGE);
1197 1.39 christos CHaddrange(p, cs, start, finish);
1198 1.39 christos } else {
1199 1.39 christos (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE);
1200 1.39 christos for (i = 0; i <= UCHAR_MAX; i++) {
1201 1.39 christos if (p_range_cmp(start, i) <= 0 &&
1202 1.39 christos p_range_cmp(i, finish) <= 0 )
1203 1.39 christos CHadd(p, cs, i);
1204 1.39 christos }
1205 1.39 christos }
1206 1.39 christos }
1207 1.39 christos break;
1208 1.39 christos }
1209 1.39 christos }
1210 1.39 christos
1211 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
1212 1.39 christos /*
1213 1.39 christos - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S)
1214 1.39 christos == static int p_b_pseudoclass(struct parse *p, char c)
1215 1.39 christos */
1216 1.39 christos static int
1217 1.39 christos p_b_pseudoclass(struct parse *p, char c) {
1218 1.39 christos cset *cs;
1219 1.39 christos
1220 1.39 christos if ((cs = allocset(p)) == NULL)
1221 1.39 christos return(0);
1222 1.39 christos
1223 1.39 christos if (p->g->cflags®_ICASE)
1224 1.39 christos cs->icase = 1;
1225 1.39 christos
1226 1.39 christos switch (c) {
1227 1.39 christos case 'W':
1228 1.39 christos cs->invert = 1;
1229 1.39 christos /* FALLTHROUGH */
1230 1.39 christos case 'w':
1231 1.39 christos p_b_cclass_named(p, cs, "alnum");
1232 1.1 jtc break;
1233 1.39 christos case 'S':
1234 1.39 christos cs->invert = 1;
1235 1.39 christos /* FALLTHROUGH */
1236 1.39 christos case 's':
1237 1.39 christos p_b_cclass_named(p, cs, "space");
1238 1.39 christos break;
1239 1.39 christos default:
1240 1.39 christos return(0);
1241 1.1 jtc }
1242 1.39 christos
1243 1.40 christos EMIT(OANYOF, (size_t)(cs - p->g->sets));
1244 1.39 christos return(1);
1245 1.1 jtc }
1246 1.39 christos #endif
1247 1.1 jtc
1248 1.1 jtc /*
1249 1.1 jtc - p_b_cclass - parse a character-class name and deal with it
1250 1.9 perry == static void p_b_cclass(struct parse *p, cset *cs);
1251 1.1 jtc */
1252 1.1 jtc static void
1253 1.39 christos p_b_cclass(struct parse *p, cset *cs)
1254 1.1 jtc {
1255 1.39 christos const char *sp = p->next;
1256 1.9 perry size_t len;
1257 1.39 christos char clname[16];
1258 1.1 jtc
1259 1.39 christos while (MORE() && isalpha((uch)PEEK()))
1260 1.1 jtc NEXT();
1261 1.1 jtc len = p->next - sp;
1262 1.39 christos if (len >= sizeof(clname) - 1) {
1263 1.1 jtc SETERROR(REG_ECTYPE);
1264 1.1 jtc return;
1265 1.1 jtc }
1266 1.39 christos memcpy(clname, sp, len);
1267 1.39 christos clname[len] = '\0';
1268 1.39 christos
1269 1.39 christos p_b_cclass_named(p, cs, clname);
1270 1.39 christos }
1271 1.42 christos
1272 1.39 christos /*
1273 1.39 christos - p_b_cclass_named - deal with a named character class
1274 1.39 christos == static void p_b_cclass_named(struct parse *p, cset *cs, const char []);
1275 1.39 christos */
1276 1.39 christos static void
1277 1.39 christos p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) {
1278 1.39 christos wctype_t wct;
1279 1.1 jtc
1280 1.39 christos if ((wct = wctype(clname)) == 0) {
1281 1.39 christos SETERROR(REG_ECTYPE);
1282 1.39 christos return;
1283 1.39 christos }
1284 1.39 christos CHaddtype(p, cs, wct);
1285 1.1 jtc }
1286 1.1 jtc
1287 1.1 jtc /*
1288 1.1 jtc - p_b_eclass - parse an equivalence-class name and deal with it
1289 1.9 perry == static void p_b_eclass(struct parse *p, cset *cs);
1290 1.1 jtc *
1291 1.1 jtc * This implementation is incomplete. xxx
1292 1.1 jtc */
1293 1.1 jtc static void
1294 1.39 christos p_b_eclass(struct parse *p, cset *cs)
1295 1.1 jtc {
1296 1.39 christos wint_t c;
1297 1.1 jtc
1298 1.14 lukem _DIAGASSERT(p != NULL);
1299 1.14 lukem _DIAGASSERT(cs != NULL);
1300 1.14 lukem
1301 1.1 jtc c = p_b_coll_elem(p, '=');
1302 1.39 christos CHadd(p, cs, c);
1303 1.1 jtc }
1304 1.1 jtc
1305 1.1 jtc /*
1306 1.1 jtc - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1307 1.39 christos == static wint_t p_b_symbol(struct parse *p);
1308 1.1 jtc */
1309 1.39 christos static wint_t /* value of symbol */
1310 1.39 christos p_b_symbol(struct parse *p)
1311 1.1 jtc {
1312 1.39 christos wint_t value;
1313 1.1 jtc
1314 1.14 lukem _DIAGASSERT(p != NULL);
1315 1.14 lukem
1316 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1317 1.1 jtc if (!EATTWO('[', '.'))
1318 1.39 christos return(WGETNEXT());
1319 1.1 jtc
1320 1.1 jtc /* collating symbol */
1321 1.1 jtc value = p_b_coll_elem(p, '.');
1322 1.39 christos (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1323 1.1 jtc return(value);
1324 1.1 jtc }
1325 1.1 jtc
1326 1.1 jtc /*
1327 1.1 jtc - p_b_coll_elem - parse a collating-element name and look it up
1328 1.39 christos == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
1329 1.1 jtc */
1330 1.39 christos static wint_t /* value of collating element */
1331 1.39 christos p_b_coll_elem(struct parse *p,
1332 1.39 christos wint_t endc) /* name ended by endc,']' */
1333 1.39 christos {
1334 1.39 christos const char *sp = p->next;
1335 1.39 christos struct cname *cp;
1336 1.42 christos size_t len;
1337 1.1 jtc
1338 1.14 lukem _DIAGASSERT(p != NULL);
1339 1.14 lukem
1340 1.1 jtc while (MORE() && !SEETWO(endc, ']'))
1341 1.1 jtc NEXT();
1342 1.1 jtc if (!MORE()) {
1343 1.1 jtc SETERROR(REG_EBRACK);
1344 1.1 jtc return(0);
1345 1.1 jtc }
1346 1.1 jtc len = p->next - sp;
1347 1.1 jtc for (cp = cnames; cp->name != NULL; cp++)
1348 1.37 christos if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1349 1.1 jtc return(cp->code); /* known name */
1350 1.42 christos #ifdef NLS
1351 1.42 christos mbstate_t mbs;
1352 1.42 christos wchar_t wc;
1353 1.42 christos size_t clen;
1354 1.42 christos
1355 1.39 christos memset(&mbs, 0, sizeof(mbs));
1356 1.39 christos if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1357 1.39 christos return (wc); /* single character */
1358 1.39 christos else if (clen == (size_t)-1 || clen == (size_t)-2)
1359 1.39 christos SETERROR(REG_ILLSEQ);
1360 1.39 christos else
1361 1.39 christos SETERROR(REG_ECOLLATE); /* neither */
1362 1.1 jtc return(0);
1363 1.42 christos #else
1364 1.42 christos if (len == 1)
1365 1.42 christos return *sp; /* single character */
1366 1.42 christos SETERROR(REG_ECOLLATE); /* neither */
1367 1.42 christos return 0;
1368 1.42 christos #endif
1369 1.1 jtc }
1370 1.1 jtc
1371 1.1 jtc /*
1372 1.39 christos - may_escape - determine whether 'ch' is escape-able in the current context
1373 1.39 christos == static int may_escape(struct parse *p, const wint_t ch)
1374 1.39 christos */
1375 1.39 christos static bool
1376 1.39 christos may_escape(struct parse *p, const wint_t ch)
1377 1.39 christos {
1378 1.39 christos
1379 1.39 christos if ((p->pflags & PFLAG_LEGACY_ESC) != 0)
1380 1.39 christos return (true);
1381 1.39 christos if (isalpha(ch) || ch == '\'' || ch == '`')
1382 1.39 christos return (false);
1383 1.39 christos return (true);
1384 1.39 christos #ifdef NOTYET
1385 1.39 christos /*
1386 1.39 christos * Build a whitelist of characters that may be escaped to produce an
1387 1.39 christos * ordinary in the current context. This assumes that these have not
1388 1.39 christos * been otherwise interpreted as a special character. Escaping an
1389 1.39 christos * ordinary character yields undefined results according to
1390 1.39 christos * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
1391 1.39 christos * advantage of this and use escaped ordinary characters to provide
1392 1.39 christos * special meaning, e.g. \b, \B, \w, \W, \s, \S.
1393 1.39 christos */
1394 1.39 christos switch(ch) {
1395 1.39 christos case '|':
1396 1.39 christos case '+':
1397 1.39 christos case '?':
1398 1.39 christos /* The above characters may not be escaped in BREs */
1399 1.39 christos if (!(p->g->cflags®_EXTENDED))
1400 1.39 christos return (false);
1401 1.39 christos /* Fallthrough */
1402 1.39 christos case '(':
1403 1.39 christos case ')':
1404 1.39 christos case '{':
1405 1.39 christos case '}':
1406 1.39 christos case '.':
1407 1.39 christos case '[':
1408 1.39 christos case ']':
1409 1.39 christos case '\\':
1410 1.39 christos case '*':
1411 1.39 christos case '^':
1412 1.39 christos case '$':
1413 1.39 christos return (true);
1414 1.39 christos default:
1415 1.39 christos return (false);
1416 1.39 christos }
1417 1.39 christos #endif
1418 1.39 christos }
1419 1.39 christos
1420 1.39 christos /*
1421 1.1 jtc - othercase - return the case counterpart of an alphabetic
1422 1.39 christos == static wint_t othercase(wint_t ch);
1423 1.1 jtc */
1424 1.39 christos static wint_t /* if no counterpart, return ch */
1425 1.39 christos othercase(wint_t ch)
1426 1.39 christos {
1427 1.39 christos assert(iswalpha(ch));
1428 1.39 christos if (iswupper(ch))
1429 1.39 christos return(towlower(ch));
1430 1.39 christos else if (iswlower(ch))
1431 1.39 christos return(towupper(ch));
1432 1.1 jtc else /* peculiar, but could happen */
1433 1.1 jtc return(ch);
1434 1.1 jtc }
1435 1.1 jtc
1436 1.1 jtc /*
1437 1.1 jtc - bothcases - emit a dualcase version of a two-case character
1438 1.39 christos == static void bothcases(struct parse *p, wint_t ch);
1439 1.1 jtc *
1440 1.1 jtc * Boy, is this implementation ever a kludge...
1441 1.1 jtc */
1442 1.1 jtc static void
1443 1.39 christos bothcases(struct parse *p, wint_t ch)
1444 1.1 jtc {
1445 1.39 christos const char *oldnext = p->next;
1446 1.39 christos const char *oldend = p->end;
1447 1.39 christos char bracket[3 + MB_LEN_MAX];
1448 1.39 christos size_t n;
1449 1.1 jtc
1450 1.14 lukem _DIAGASSERT(p != NULL);
1451 1.14 lukem
1452 1.1 jtc assert(othercase(ch) != ch); /* p_bracket() would recurse */
1453 1.1 jtc p->next = bracket;
1454 1.42 christos #ifdef NLS
1455 1.42 christos mbstate_t mbs;
1456 1.39 christos memset(&mbs, 0, sizeof(mbs));
1457 1.39 christos n = wcrtomb(bracket, ch, &mbs);
1458 1.39 christos assert(n != (size_t)-1);
1459 1.42 christos #else
1460 1.42 christos n = 0;
1461 1.42 christos bracket[n++] = ch;
1462 1.42 christos #endif
1463 1.39 christos bracket[n] = ']';
1464 1.39 christos bracket[n + 1] = '\0';
1465 1.39 christos p->end = bracket+n+1;
1466 1.1 jtc p_bracket(p);
1467 1.39 christos assert(p->next == p->end);
1468 1.1 jtc p->next = oldnext;
1469 1.1 jtc p->end = oldend;
1470 1.1 jtc }
1471 1.1 jtc
1472 1.1 jtc /*
1473 1.1 jtc - ordinary - emit an ordinary character
1474 1.39 christos == static void ordinary(struct parse *p, wint_t ch);
1475 1.1 jtc */
1476 1.1 jtc static void
1477 1.39 christos ordinary(struct parse *p, wint_t ch)
1478 1.1 jtc {
1479 1.39 christos cset *cs;
1480 1.1 jtc
1481 1.14 lukem _DIAGASSERT(p != NULL);
1482 1.14 lukem
1483 1.39 christos if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1484 1.39 christos bothcases(p, ch);
1485 1.40 christos else if ((wint_t)(ch & OPDMASK) == ch)
1486 1.40 christos EMIT(OCHAR, (size_t)ch);
1487 1.1 jtc else {
1488 1.39 christos /*
1489 1.39 christos * Kludge: character is too big to fit into an OCHAR operand.
1490 1.39 christos * Emit a singleton set.
1491 1.39 christos */
1492 1.39 christos if ((cs = allocset(p)) == NULL)
1493 1.39 christos return;
1494 1.39 christos CHadd(p, cs, ch);
1495 1.40 christos EMIT(OANYOF, (size_t)(cs - p->g->sets));
1496 1.1 jtc }
1497 1.1 jtc }
1498 1.1 jtc
1499 1.1 jtc /*
1500 1.1 jtc - nonnewline - emit REG_NEWLINE version of OANY
1501 1.9 perry == static void nonnewline(struct parse *p);
1502 1.1 jtc *
1503 1.1 jtc * Boy, is this implementation ever a kludge...
1504 1.1 jtc */
1505 1.1 jtc static void
1506 1.39 christos nonnewline(struct parse *p)
1507 1.1 jtc {
1508 1.39 christos const char *oldnext = p->next;
1509 1.39 christos const char *oldend = p->end;
1510 1.1 jtc char bracket[4];
1511 1.1 jtc
1512 1.14 lukem _DIAGASSERT(p != NULL);
1513 1.14 lukem
1514 1.1 jtc p->next = bracket;
1515 1.1 jtc p->end = bracket+3;
1516 1.1 jtc bracket[0] = '^';
1517 1.1 jtc bracket[1] = '\n';
1518 1.1 jtc bracket[2] = ']';
1519 1.1 jtc bracket[3] = '\0';
1520 1.1 jtc p_bracket(p);
1521 1.1 jtc assert(p->next == bracket+3);
1522 1.1 jtc p->next = oldnext;
1523 1.1 jtc p->end = oldend;
1524 1.1 jtc }
1525 1.1 jtc
1526 1.1 jtc /*
1527 1.1 jtc - repeat - generate code for a bounded repetition, recursively if needed
1528 1.39 christos == static void repeat(struct parse *p, sopno start, int from, int to);
1529 1.1 jtc */
1530 1.1 jtc static void
1531 1.39 christos repeat(struct parse *p,
1532 1.39 christos sopno start, /* operand from here to end of strip */
1533 1.39 christos int from, /* repeated from this number */
1534 1.39 christos int to) /* to this number of times (maybe INFINITY) */
1535 1.1 jtc {
1536 1.39 christos sopno finish = HERE();
1537 1.1 jtc # define N 2
1538 1.1 jtc # define INF 3
1539 1.1 jtc # define REP(f, t) ((f)*8 + (t))
1540 1.1 jtc # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1541 1.9 perry sopno copy;
1542 1.1 jtc
1543 1.14 lukem _DIAGASSERT(p != NULL);
1544 1.14 lukem
1545 1.39 christos if (p->error != 0) /* head off possible runaway recursion */
1546 1.30 christos return;
1547 1.30 christos
1548 1.1 jtc assert(from <= to);
1549 1.1 jtc
1550 1.1 jtc switch (REP(MAP(from), MAP(to))) {
1551 1.1 jtc case REP(0, 0): /* must be user doing this */
1552 1.1 jtc DROP(finish-start); /* drop the operand */
1553 1.1 jtc break;
1554 1.1 jtc case REP(0, 1): /* as x{1,1}? */
1555 1.1 jtc case REP(0, N): /* as x{1,n}? */
1556 1.1 jtc case REP(0, INF): /* as x{1,}? */
1557 1.4 jtc /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1558 1.4 jtc INSERT(OCH_, start); /* offset is wrong... */
1559 1.39 christos repeat(p, start+1, 1, to);
1560 1.4 jtc ASTERN(OOR1, start);
1561 1.1 jtc AHEAD(start); /* ... fix it */
1562 1.4 jtc EMIT(OOR2, 0);
1563 1.4 jtc AHEAD(THERE());
1564 1.4 jtc ASTERN(O_CH, THERETHERE());
1565 1.1 jtc break;
1566 1.1 jtc case REP(1, 1): /* trivial case */
1567 1.1 jtc /* done */
1568 1.1 jtc break;
1569 1.1 jtc case REP(1, N): /* as x?x{1,n-1} */
1570 1.4 jtc /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1571 1.4 jtc INSERT(OCH_, start);
1572 1.4 jtc ASTERN(OOR1, start);
1573 1.4 jtc AHEAD(start);
1574 1.4 jtc EMIT(OOR2, 0); /* offset very wrong... */
1575 1.4 jtc AHEAD(THERE()); /* ...so fix it */
1576 1.4 jtc ASTERN(O_CH, THERETHERE());
1577 1.1 jtc copy = dupl(p, start+1, finish+1);
1578 1.4 jtc assert(copy == finish+4);
1579 1.39 christos repeat(p, copy, 1, to-1);
1580 1.1 jtc break;
1581 1.1 jtc case REP(1, INF): /* as x+ */
1582 1.1 jtc INSERT(OPLUS_, start);
1583 1.1 jtc ASTERN(O_PLUS, start);
1584 1.1 jtc break;
1585 1.1 jtc case REP(N, N): /* as xx{m-1,n-1} */
1586 1.1 jtc copy = dupl(p, start, finish);
1587 1.39 christos repeat(p, copy, from-1, to-1);
1588 1.1 jtc break;
1589 1.1 jtc case REP(N, INF): /* as xx{n-1,INF} */
1590 1.1 jtc copy = dupl(p, start, finish);
1591 1.39 christos repeat(p, copy, from-1, to);
1592 1.1 jtc break;
1593 1.1 jtc default: /* "can't happen" */
1594 1.1 jtc SETERROR(REG_ASSERT); /* just in case */
1595 1.1 jtc break;
1596 1.1 jtc }
1597 1.1 jtc }
1598 1.1 jtc
1599 1.1 jtc /*
1600 1.39 christos - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1601 1.39 christos - character from the parse struct, signals a REG_ILLSEQ error if the
1602 1.39 christos - character can't be converted. Returns the number of bytes consumed.
1603 1.39 christos */
1604 1.39 christos static wint_t
1605 1.39 christos wgetnext(struct parse *p)
1606 1.39 christos {
1607 1.42 christos #ifdef NLS
1608 1.39 christos mbstate_t mbs;
1609 1.39 christos wchar_t wc;
1610 1.39 christos size_t n;
1611 1.39 christos
1612 1.39 christos memset(&mbs, 0, sizeof(mbs));
1613 1.40 christos n = mbrtowc(&wc, p->next, (size_t)(p->end - p->next), &mbs);
1614 1.39 christos if (n == (size_t)-1 || n == (size_t)-2) {
1615 1.39 christos SETERROR(REG_ILLSEQ);
1616 1.39 christos return (0);
1617 1.39 christos }
1618 1.39 christos if (n == 0)
1619 1.39 christos n = 1;
1620 1.39 christos p->next += n;
1621 1.42 christos return wc;
1622 1.42 christos #else
1623 1.42 christos return *p->next++;
1624 1.42 christos #endif
1625 1.39 christos }
1626 1.39 christos
1627 1.39 christos /*
1628 1.1 jtc - seterr - set an error condition
1629 1.9 perry == static int seterr(struct parse *p, int e);
1630 1.1 jtc */
1631 1.1 jtc static int /* useless but makes type checking happy */
1632 1.39 christos seterr(struct parse *p, int e)
1633 1.1 jtc {
1634 1.14 lukem
1635 1.14 lukem _DIAGASSERT(p != NULL);
1636 1.14 lukem
1637 1.1 jtc if (p->error == 0) /* keep earliest error condition */
1638 1.1 jtc p->error = e;
1639 1.1 jtc p->next = nuls; /* try to bring things to a halt */
1640 1.1 jtc p->end = nuls;
1641 1.1 jtc return(0); /* make the return value well-defined */
1642 1.1 jtc }
1643 1.1 jtc
1644 1.1 jtc /*
1645 1.1 jtc - allocset - allocate a set of characters for []
1646 1.9 perry == static cset *allocset(struct parse *p);
1647 1.1 jtc */
1648 1.1 jtc static cset *
1649 1.39 christos allocset(struct parse *p)
1650 1.1 jtc {
1651 1.39 christos cset *cs, *ncs;
1652 1.1 jtc
1653 1.14 lukem _DIAGASSERT(p != NULL);
1654 1.14 lukem
1655 1.39 christos ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs));
1656 1.39 christos if (ncs == NULL) {
1657 1.39 christos SETERROR(REG_ESPACE);
1658 1.39 christos return (NULL);
1659 1.1 jtc }
1660 1.39 christos p->g->sets = ncs;
1661 1.39 christos cs = &p->g->sets[p->g->ncsets++];
1662 1.39 christos memset(cs, 0, sizeof(*cs));
1663 1.1 jtc
1664 1.1 jtc return(cs);
1665 1.1 jtc }
1666 1.1 jtc
1667 1.1 jtc /*
1668 1.1 jtc - freeset - free a now-unused set
1669 1.9 perry == static void freeset(struct parse *p, cset *cs);
1670 1.1 jtc */
1671 1.1 jtc static void
1672 1.39 christos freeset(struct parse *p, cset *cs)
1673 1.1 jtc {
1674 1.14 lukem cset *top;
1675 1.14 lukem
1676 1.14 lukem _DIAGASSERT(p != NULL);
1677 1.14 lukem _DIAGASSERT(cs != NULL);
1678 1.14 lukem
1679 1.14 lukem top = &p->g->sets[p->g->ncsets];
1680 1.1 jtc
1681 1.39 christos free(cs->wides);
1682 1.39 christos free(cs->ranges);
1683 1.39 christos free(cs->types);
1684 1.39 christos memset(cs, 0, sizeof(*cs));
1685 1.1 jtc if (cs == top-1) /* recover only the easy case */
1686 1.1 jtc p->g->ncsets--;
1687 1.1 jtc }
1688 1.1 jtc
1689 1.1 jtc /*
1690 1.39 christos - singleton - Determine whether a set contains only one character,
1691 1.39 christos - returning it if so, otherwise returning OUT.
1692 1.1 jtc */
1693 1.39 christos static wint_t
1694 1.39 christos singleton(cset *cs)
1695 1.1 jtc {
1696 1.39 christos wint_t i, s, n;
1697 1.1 jtc
1698 1.39 christos for (i = n = 0; i < NC; i++)
1699 1.39 christos if (CHIN(cs, i)) {
1700 1.1 jtc n++;
1701 1.39 christos s = i;
1702 1.39 christos }
1703 1.39 christos if (n == 1)
1704 1.39 christos return (s);
1705 1.39 christos if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1706 1.39 christos cs->icase == 0)
1707 1.39 christos return (cs->wides[0]);
1708 1.39 christos /* Don't bother handling the other cases. */
1709 1.39 christos return (OUT);
1710 1.1 jtc }
1711 1.1 jtc
1712 1.1 jtc /*
1713 1.39 christos - CHadd - add character to character set.
1714 1.1 jtc */
1715 1.1 jtc static void
1716 1.39 christos CHadd(struct parse *p, cset *cs, wint_t ch)
1717 1.1 jtc {
1718 1.39 christos wint_t nch, *newwides;
1719 1.14 lukem
1720 1.14 lukem _DIAGASSERT(p != NULL);
1721 1.14 lukem _DIAGASSERT(cs != NULL);
1722 1.14 lukem
1723 1.39 christos assert(ch >= 0);
1724 1.39 christos if (ch < NC)
1725 1.40 christos cs->bmp[(unsigned)ch >> 3] |= 1 << (ch & 7);
1726 1.39 christos else {
1727 1.39 christos newwides = reallocarray(cs->wides, cs->nwides + 1,
1728 1.39 christos sizeof(*cs->wides));
1729 1.39 christos if (newwides == NULL) {
1730 1.39 christos SETERROR(REG_ESPACE);
1731 1.39 christos return;
1732 1.39 christos }
1733 1.39 christos cs->wides = newwides;
1734 1.39 christos cs->wides[cs->nwides++] = ch;
1735 1.39 christos }
1736 1.39 christos if (cs->icase) {
1737 1.39 christos if ((nch = towlower(ch)) < NC)
1738 1.40 christos cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1739 1.39 christos if ((nch = towupper(ch)) < NC)
1740 1.40 christos cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1741 1.1 jtc }
1742 1.1 jtc }
1743 1.1 jtc
1744 1.1 jtc /*
1745 1.39 christos - CHaddrange - add all characters in the range [min,max] to a character set.
1746 1.1 jtc */
1747 1.1 jtc static void
1748 1.39 christos CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1749 1.1 jtc {
1750 1.39 christos crange *newranges;
1751 1.14 lukem
1752 1.39 christos _DIAGASSERT(p != NULL);
1753 1.14 lukem _DIAGASSERT(cs != NULL);
1754 1.14 lukem
1755 1.39 christos for (; min < NC && min <= max; min++)
1756 1.39 christos CHadd(p, cs, min);
1757 1.39 christos if (min >= max)
1758 1.39 christos return;
1759 1.39 christos newranges = reallocarray(cs->ranges, cs->nranges + 1,
1760 1.39 christos sizeof(*cs->ranges));
1761 1.39 christos if (newranges == NULL) {
1762 1.39 christos SETERROR(REG_ESPACE);
1763 1.1 jtc return;
1764 1.1 jtc }
1765 1.39 christos cs->ranges = newranges;
1766 1.39 christos cs->ranges[cs->nranges].min = min;
1767 1.39 christos cs->ranges[cs->nranges].max = max;
1768 1.39 christos cs->nranges++;
1769 1.1 jtc }
1770 1.1 jtc
1771 1.1 jtc /*
1772 1.39 christos - CHaddtype - add all characters of a certain type to a character set.
1773 1.1 jtc */
1774 1.1 jtc static void
1775 1.39 christos CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1776 1.1 jtc {
1777 1.39 christos wint_t i;
1778 1.39 christos wctype_t *newtypes;
1779 1.14 lukem
1780 1.14 lukem _DIAGASSERT(p != NULL);
1781 1.14 lukem _DIAGASSERT(cs != NULL);
1782 1.14 lukem
1783 1.39 christos for (i = 0; i < NC; i++)
1784 1.39 christos if (iswctype(i, wct))
1785 1.39 christos CHadd(p, cs, i);
1786 1.39 christos newtypes = reallocarray(cs->types, cs->ntypes + 1,
1787 1.39 christos sizeof(*cs->types));
1788 1.39 christos if (newtypes == NULL) {
1789 1.39 christos SETERROR(REG_ESPACE);
1790 1.1 jtc return;
1791 1.39 christos }
1792 1.39 christos cs->types = newtypes;
1793 1.39 christos cs->types[cs->ntypes++] = wct;
1794 1.1 jtc }
1795 1.1 jtc
1796 1.1 jtc /*
1797 1.1 jtc - dupl - emit a duplicate of a bunch of sops
1798 1.9 perry == static sopno dupl(struct parse *p, sopno start, sopno finish);
1799 1.1 jtc */
1800 1.1 jtc static sopno /* start of duplicate */
1801 1.39 christos dupl(struct parse *p,
1802 1.39 christos sopno start, /* from here */
1803 1.39 christos sopno finish) /* to this less one */
1804 1.1 jtc {
1805 1.39 christos sopno ret = HERE();
1806 1.9 perry sopno len = finish - start;
1807 1.1 jtc
1808 1.14 lukem _DIAGASSERT(p != NULL);
1809 1.14 lukem
1810 1.1 jtc assert(finish >= start);
1811 1.1 jtc if (len == 0)
1812 1.1 jtc return(ret);
1813 1.39 christos if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1814 1.39 christos return(ret);
1815 1.40 christos (void) memcpy(p->strip + p->slen,
1816 1.40 christos p->strip + start, len * sizeof(*p->strip));
1817 1.1 jtc p->slen += len;
1818 1.1 jtc return(ret);
1819 1.1 jtc }
1820 1.1 jtc
1821 1.1 jtc /*
1822 1.1 jtc - doemit - emit a strip operator
1823 1.9 perry == static void doemit(struct parse *p, sop op, size_t opnd);
1824 1.1 jtc *
1825 1.1 jtc * It might seem better to implement this as a macro with a function as
1826 1.1 jtc * hard-case backup, but it's just too big and messy unless there are
1827 1.1 jtc * some changes to the data structures. Maybe later.
1828 1.1 jtc */
1829 1.1 jtc static void
1830 1.39 christos doemit(struct parse *p, sop op, size_t opnd)
1831 1.1 jtc {
1832 1.1 jtc /* avoid making error situations worse */
1833 1.1 jtc if (p->error != 0)
1834 1.1 jtc return;
1835 1.1 jtc
1836 1.39 christos _DIAGASSERT(p != NULL);
1837 1.39 christos
1838 1.1 jtc /* deal with oversize operands ("can't happen", more or less) */
1839 1.1 jtc assert(opnd < 1<<OPSHIFT);
1840 1.1 jtc
1841 1.1 jtc /* deal with undersized strip */
1842 1.1 jtc if (p->slen >= p->ssize)
1843 1.30 christos if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */
1844 1.30 christos return;
1845 1.1 jtc
1846 1.1 jtc /* finally, it's all reduced to the easy case */
1847 1.40 christos p->strip[p->slen++] = (sopno)SOP(op, opnd);
1848 1.1 jtc }
1849 1.1 jtc
1850 1.1 jtc /*
1851 1.1 jtc - doinsert - insert a sop into the strip
1852 1.9 perry == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1853 1.1 jtc */
1854 1.1 jtc static void
1855 1.39 christos doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1856 1.1 jtc {
1857 1.9 perry sopno sn;
1858 1.9 perry sop s;
1859 1.9 perry int i;
1860 1.1 jtc
1861 1.14 lukem _DIAGASSERT(p != NULL);
1862 1.14 lukem
1863 1.1 jtc /* avoid making error situations worse */
1864 1.1 jtc if (p->error != 0)
1865 1.1 jtc return;
1866 1.1 jtc
1867 1.1 jtc sn = HERE();
1868 1.1 jtc EMIT(op, opnd); /* do checks, ensure space */
1869 1.1 jtc assert(HERE() == sn+1);
1870 1.1 jtc s = p->strip[sn];
1871 1.1 jtc
1872 1.1 jtc /* adjust paren pointers */
1873 1.1 jtc assert(pos > 0);
1874 1.1 jtc for (i = 1; i < NPAREN; i++) {
1875 1.1 jtc if (p->pbegin[i] >= pos) {
1876 1.1 jtc p->pbegin[i]++;
1877 1.1 jtc }
1878 1.1 jtc if (p->pend[i] >= pos) {
1879 1.1 jtc p->pend[i]++;
1880 1.1 jtc }
1881 1.1 jtc }
1882 1.1 jtc
1883 1.40 christos memmove(&p->strip[pos+1], &p->strip[pos],
1884 1.40 christos (HERE()-pos-1)*sizeof(*p->strip));
1885 1.1 jtc p->strip[pos] = s;
1886 1.1 jtc }
1887 1.1 jtc
1888 1.1 jtc /*
1889 1.1 jtc - dofwd - complete a forward reference
1890 1.9 perry == static void dofwd(struct parse *p, sopno pos, sop value);
1891 1.1 jtc */
1892 1.1 jtc static void
1893 1.39 christos dofwd(struct parse *p, sopno pos, sop value)
1894 1.1 jtc {
1895 1.14 lukem
1896 1.14 lukem _DIAGASSERT(p != NULL);
1897 1.14 lukem
1898 1.1 jtc /* avoid making error situations worse */
1899 1.1 jtc if (p->error != 0)
1900 1.1 jtc return;
1901 1.1 jtc
1902 1.1 jtc assert(value < 1<<OPSHIFT);
1903 1.39 christos p->strip[pos] = OP(p->strip[pos]) | value;
1904 1.1 jtc }
1905 1.1 jtc
1906 1.1 jtc /*
1907 1.1 jtc - enlarge - enlarge the strip
1908 1.39 christos == static int enlarge(struct parse *p, sopno size);
1909 1.1 jtc */
1910 1.30 christos static int
1911 1.35 joerg enlarge(struct parse *p, sopno size)
1912 1.1 jtc {
1913 1.39 christos sop *sp;
1914 1.39 christos
1915 1.14 lukem _DIAGASSERT(p != NULL);
1916 1.14 lukem
1917 1.1 jtc if (p->ssize >= size)
1918 1.30 christos return 1;
1919 1.1 jtc
1920 1.40 christos sp = reallocarray(p->strip, size, sizeof(*p->strip));
1921 1.39 christos if (sp == NULL) {
1922 1.1 jtc SETERROR(REG_ESPACE);
1923 1.30 christos return 0;
1924 1.1 jtc }
1925 1.39 christos p->strip = sp;
1926 1.35 joerg p->ssize = size;
1927 1.30 christos return 1;
1928 1.1 jtc }
1929 1.1 jtc
1930 1.1 jtc /*
1931 1.1 jtc - stripsnug - compact the strip
1932 1.9 perry == static void stripsnug(struct parse *p, struct re_guts *g);
1933 1.1 jtc */
1934 1.1 jtc static void
1935 1.39 christos stripsnug(struct parse *p, struct re_guts *g)
1936 1.1 jtc {
1937 1.14 lukem
1938 1.14 lukem _DIAGASSERT(p != NULL);
1939 1.14 lukem _DIAGASSERT(g != NULL);
1940 1.14 lukem
1941 1.1 jtc g->nstates = p->slen;
1942 1.40 christos g->strip = reallocarray(p->strip, p->slen, sizeof(*p->strip));
1943 1.39 christos if (g->strip == NULL) {
1944 1.39 christos SETERROR(REG_ESPACE);
1945 1.39 christos g->strip = p->strip;
1946 1.39 christos }
1947 1.1 jtc }
1948 1.1 jtc
1949 1.1 jtc /*
1950 1.1 jtc - findmust - fill in must and mlen with longest mandatory literal string
1951 1.9 perry == static void findmust(struct parse *p, struct re_guts *g);
1952 1.1 jtc *
1953 1.1 jtc * This algorithm could do fancy things like analyzing the operands of |
1954 1.1 jtc * for common subsequences. Someday. This code is simple and finds most
1955 1.1 jtc * of the interesting cases.
1956 1.1 jtc *
1957 1.1 jtc * Note that must and mlen got initialized during setup.
1958 1.1 jtc */
1959 1.1 jtc static void
1960 1.39 christos findmust(struct parse *p, struct re_guts *g)
1961 1.1 jtc {
1962 1.9 perry sop *scan;
1963 1.7 christos sop *start = NULL;
1964 1.9 perry sop *newstart = NULL;
1965 1.9 perry sopno newlen;
1966 1.9 perry sop s;
1967 1.9 perry char *cp;
1968 1.39 christos int offset;
1969 1.39 christos mbstate_t mbs;
1970 1.1 jtc
1971 1.14 lukem _DIAGASSERT(p != NULL);
1972 1.14 lukem _DIAGASSERT(g != NULL);
1973 1.14 lukem
1974 1.1 jtc /* avoid making error situations worse */
1975 1.1 jtc if (p->error != 0)
1976 1.1 jtc return;
1977 1.1 jtc
1978 1.39 christos #ifdef notyet
1979 1.39 christos /*
1980 1.39 christos * It's not generally safe to do a ``char'' substring search on
1981 1.39 christos * multibyte character strings, but it's safe for at least
1982 1.39 christos * UTF-8 (see RFC 3629).
1983 1.39 christos */
1984 1.39 christos if (MB_CUR_MAX > 1 &&
1985 1.39 christos strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1986 1.39 christos return;
1987 1.39 christos #endif
1988 1.39 christos
1989 1.1 jtc /* find the longest OCHAR sequence in strip */
1990 1.1 jtc newlen = 0;
1991 1.39 christos offset = 0;
1992 1.39 christos g->moffset = 0;
1993 1.1 jtc scan = g->strip + 1;
1994 1.1 jtc do {
1995 1.1 jtc s = *scan++;
1996 1.1 jtc switch (OP(s)) {
1997 1.1 jtc case OCHAR: /* sequence member */
1998 1.39 christos if (newlen == 0) { /* new sequence */
1999 1.39 christos memset(&mbs, 0, sizeof(mbs));
2000 1.1 jtc newstart = scan - 1;
2001 1.39 christos }
2002 1.42 christos #ifdef NLS
2003 1.42 christos char buf[MB_LEN_MAX];
2004 1.42 christos size_t clen = wcrtomb(buf, (int)OPND(s), &mbs);
2005 1.39 christos if (clen == (size_t)-1)
2006 1.39 christos goto toohard;
2007 1.40 christos newlen += (sopno)clen;
2008 1.42 christos #else
2009 1.42 christos newlen++;
2010 1.42 christos #endif
2011 1.1 jtc break;
2012 1.1 jtc case OPLUS_: /* things that don't break one */
2013 1.1 jtc case OLPAREN:
2014 1.1 jtc case ORPAREN:
2015 1.1 jtc break;
2016 1.1 jtc case OQUEST_: /* things that must be skipped */
2017 1.1 jtc case OCH_:
2018 1.39 christos offset = altoffset(scan, offset);
2019 1.1 jtc scan--;
2020 1.1 jtc do {
2021 1.1 jtc scan += OPND(s);
2022 1.1 jtc s = *scan;
2023 1.1 jtc /* assert() interferes w debug printouts */
2024 1.40 christos if (OP(s) != O_QUEST &&
2025 1.40 christos OP(s) != O_CH && OP(s) != OOR2) {
2026 1.1 jtc g->iflags |= BAD;
2027 1.1 jtc return;
2028 1.1 jtc }
2029 1.40 christos } while (OP(s) != O_QUEST && OP(s) != O_CH);
2030 1.11 christos /* FALLTHROUGH */
2031 1.39 christos case OBOW: /* things that break a sequence */
2032 1.39 christos case OEOW:
2033 1.39 christos case OBOL:
2034 1.39 christos case OEOL:
2035 1.39 christos case OBOS:
2036 1.39 christos case OEOS:
2037 1.39 christos case OWBND:
2038 1.39 christos case ONWBND:
2039 1.39 christos case O_QUEST:
2040 1.39 christos case O_CH:
2041 1.39 christos case OEND:
2042 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */
2043 1.1 jtc start = newstart;
2044 1.1 jtc g->mlen = newlen;
2045 1.39 christos if (offset > -1) {
2046 1.39 christos g->moffset += offset;
2047 1.39 christos offset = newlen;
2048 1.39 christos } else
2049 1.39 christos g->moffset = offset;
2050 1.39 christos } else {
2051 1.39 christos if (offset > -1)
2052 1.39 christos offset += newlen;
2053 1.1 jtc }
2054 1.1 jtc newlen = 0;
2055 1.1 jtc break;
2056 1.39 christos case OANY:
2057 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */
2058 1.39 christos start = newstart;
2059 1.39 christos g->mlen = newlen;
2060 1.39 christos if (offset > -1) {
2061 1.39 christos g->moffset += offset;
2062 1.39 christos offset = newlen;
2063 1.39 christos } else
2064 1.39 christos g->moffset = offset;
2065 1.39 christos } else {
2066 1.39 christos if (offset > -1)
2067 1.39 christos offset += newlen;
2068 1.39 christos }
2069 1.39 christos if (offset > -1)
2070 1.39 christos offset++;
2071 1.39 christos newlen = 0;
2072 1.39 christos break;
2073 1.39 christos case OANYOF: /* may or may not invalidate offset */
2074 1.39 christos /* First, everything as OANY */
2075 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */
2076 1.39 christos start = newstart;
2077 1.39 christos g->mlen = newlen;
2078 1.39 christos if (offset > -1) {
2079 1.39 christos g->moffset += offset;
2080 1.39 christos offset = newlen;
2081 1.39 christos } else
2082 1.39 christos g->moffset = offset;
2083 1.39 christos } else {
2084 1.39 christos if (offset > -1)
2085 1.39 christos offset += newlen;
2086 1.39 christos }
2087 1.39 christos if (offset > -1)
2088 1.39 christos offset++;
2089 1.39 christos newlen = 0;
2090 1.39 christos break;
2091 1.42 christos #ifdef NLS
2092 1.40 christos toohard:/*FALLTHROUGH*/
2093 1.42 christos #endif
2094 1.39 christos default:
2095 1.39 christos /* Anything here makes it impossible or too hard
2096 1.39 christos * to calculate the offset -- so we give up;
2097 1.39 christos * save the last known good offset, in case the
2098 1.39 christos * must sequence doesn't occur later.
2099 1.39 christos */
2100 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */
2101 1.39 christos start = newstart;
2102 1.39 christos g->mlen = newlen;
2103 1.39 christos if (offset > -1)
2104 1.39 christos g->moffset += offset;
2105 1.39 christos else
2106 1.39 christos g->moffset = offset;
2107 1.39 christos }
2108 1.39 christos offset = -1;
2109 1.39 christos newlen = 0;
2110 1.39 christos break;
2111 1.1 jtc }
2112 1.1 jtc } while (OP(s) != OEND);
2113 1.1 jtc
2114 1.39 christos if (g->mlen == 0) { /* there isn't one */
2115 1.39 christos g->moffset = -1;
2116 1.1 jtc return;
2117 1.39 christos }
2118 1.1 jtc
2119 1.1 jtc /* turn it into a character string */
2120 1.1 jtc g->must = malloc((size_t)g->mlen + 1);
2121 1.1 jtc if (g->must == NULL) { /* argh; just forget it */
2122 1.1 jtc g->mlen = 0;
2123 1.39 christos g->moffset = -1;
2124 1.1 jtc return;
2125 1.1 jtc }
2126 1.1 jtc cp = g->must;
2127 1.1 jtc scan = start;
2128 1.39 christos memset(&mbs, 0, sizeof(mbs));
2129 1.39 christos while (cp < g->must + g->mlen) {
2130 1.1 jtc while (OP(s = *scan++) != OCHAR)
2131 1.1 jtc continue;
2132 1.42 christos #ifdef NLS
2133 1.40 christos clen = wcrtomb(cp, (int)OPND(s), &mbs);
2134 1.39 christos assert(clen != (size_t)-1);
2135 1.39 christos cp += clen;
2136 1.42 christos #else
2137 1.42 christos *cp++ = OPND(s);
2138 1.42 christos #endif
2139 1.1 jtc }
2140 1.2 jtc assert(cp == g->must + g->mlen);
2141 1.1 jtc *cp++ = '\0'; /* just on general principles */
2142 1.1 jtc }
2143 1.1 jtc
2144 1.1 jtc /*
2145 1.39 christos - altoffset - choose biggest offset among multiple choices
2146 1.39 christos == static int altoffset(sop *scan, int offset);
2147 1.39 christos *
2148 1.39 christos * Compute, recursively if necessary, the largest offset among multiple
2149 1.39 christos * re paths.
2150 1.39 christos */
2151 1.39 christos static int
2152 1.39 christos altoffset(sop *scan, int offset)
2153 1.39 christos {
2154 1.39 christos int largest;
2155 1.39 christos int try;
2156 1.39 christos sop s;
2157 1.39 christos
2158 1.39 christos _DIAGASSERT(scan != NULL);
2159 1.39 christos
2160 1.39 christos /* If we gave up already on offsets, return */
2161 1.39 christos if (offset == -1)
2162 1.39 christos return -1;
2163 1.39 christos
2164 1.39 christos largest = 0;
2165 1.39 christos try = 0;
2166 1.39 christos s = *scan++;
2167 1.40 christos while (OP(s) != O_QUEST && OP(s) != O_CH) {
2168 1.39 christos switch (OP(s)) {
2169 1.39 christos case OOR1:
2170 1.39 christos if (try > largest)
2171 1.39 christos largest = try;
2172 1.39 christos try = 0;
2173 1.39 christos break;
2174 1.39 christos case OQUEST_:
2175 1.39 christos case OCH_:
2176 1.39 christos try = altoffset(scan, try);
2177 1.39 christos if (try == -1)
2178 1.39 christos return -1;
2179 1.39 christos scan--;
2180 1.39 christos do {
2181 1.39 christos scan += OPND(s);
2182 1.39 christos s = *scan;
2183 1.40 christos if (OP(s) != O_QUEST &&
2184 1.40 christos OP(s) != O_CH && OP(s) != OOR2)
2185 1.39 christos return -1;
2186 1.40 christos } while (OP(s) != O_QUEST && OP(s) != O_CH);
2187 1.39 christos /* We must skip to the next position, or we'll
2188 1.39 christos * leave altoffset() too early.
2189 1.39 christos */
2190 1.39 christos scan++;
2191 1.39 christos break;
2192 1.39 christos case OANYOF:
2193 1.39 christos case OCHAR:
2194 1.39 christos case OANY:
2195 1.39 christos try++;
2196 1.40 christos /*FALLTHROUGH*/
2197 1.39 christos case OBOW:
2198 1.39 christos case OEOW:
2199 1.39 christos case OWBND:
2200 1.39 christos case ONWBND:
2201 1.39 christos case OLPAREN:
2202 1.39 christos case ORPAREN:
2203 1.39 christos case OOR2:
2204 1.39 christos break;
2205 1.39 christos default:
2206 1.39 christos try = -1;
2207 1.39 christos break;
2208 1.39 christos }
2209 1.39 christos if (try == -1)
2210 1.39 christos return -1;
2211 1.39 christos s = *scan++;
2212 1.39 christos }
2213 1.39 christos
2214 1.39 christos if (try > largest)
2215 1.39 christos largest = try;
2216 1.39 christos
2217 1.39 christos return largest+offset;
2218 1.39 christos }
2219 1.39 christos
2220 1.39 christos /*
2221 1.39 christos - computejumps - compute char jumps for BM scan
2222 1.39 christos == static void computejumps(struct parse *p, struct re_guts *g);
2223 1.39 christos *
2224 1.39 christos * This algorithm assumes g->must exists and is has size greater than
2225 1.39 christos * zero. It's based on the algorithm found on Computer Algorithms by
2226 1.39 christos * Sara Baase.
2227 1.39 christos *
2228 1.39 christos * A char jump is the number of characters one needs to jump based on
2229 1.39 christos * the value of the character from the text that was mismatched.
2230 1.39 christos */
2231 1.39 christos static void
2232 1.39 christos computejumps(struct parse *p, struct re_guts *g)
2233 1.39 christos {
2234 1.39 christos int ch;
2235 1.39 christos size_t mindex;
2236 1.39 christos
2237 1.39 christos _DIAGASSERT(p != NULL);
2238 1.39 christos _DIAGASSERT(g != NULL);
2239 1.39 christos
2240 1.39 christos /* Avoid making errors worse */
2241 1.39 christos if (p->error != 0)
2242 1.39 christos return;
2243 1.39 christos
2244 1.39 christos g->charjump = calloc((NC_MAX + 1), sizeof(*g->charjump));
2245 1.39 christos if (g->charjump == NULL) /* Not a fatal error */
2246 1.39 christos return;
2247 1.39 christos /* Adjust for signed chars, if necessary */
2248 1.39 christos g->charjump = &g->charjump[-(CHAR_MIN)];
2249 1.39 christos
2250 1.39 christos /* If the character does not exist in the pattern, the jump
2251 1.39 christos * is equal to the number of characters in the pattern.
2252 1.39 christos */
2253 1.39 christos for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
2254 1.39 christos g->charjump[ch] = g->mlen;
2255 1.39 christos
2256 1.39 christos /* If the character does exist, compute the jump that would
2257 1.39 christos * take us to the last character in the pattern equal to it
2258 1.39 christos * (notice that we match right to left, so that last character
2259 1.39 christos * is the first one that would be matched).
2260 1.39 christos */
2261 1.39 christos for (mindex = 0; mindex < g->mlen; mindex++)
2262 1.39 christos g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
2263 1.39 christos }
2264 1.39 christos
2265 1.39 christos /*
2266 1.39 christos - computematchjumps - compute match jumps for BM scan
2267 1.39 christos == static void computematchjumps(struct parse *p, struct re_guts *g);
2268 1.39 christos *
2269 1.39 christos * This algorithm assumes g->must exists and is has size greater than
2270 1.39 christos * zero. It's based on the algorithm found on Computer Algorithms by
2271 1.39 christos * Sara Baase.
2272 1.39 christos *
2273 1.39 christos * A match jump is the number of characters one needs to advance based
2274 1.39 christos * on the already-matched suffix.
2275 1.39 christos * Notice that all values here are minus (g->mlen-1), because of the way
2276 1.39 christos * the search algorithm works.
2277 1.39 christos */
2278 1.39 christos static void
2279 1.39 christos computematchjumps(struct parse *p, struct re_guts *g)
2280 1.39 christos {
2281 1.39 christos size_t mindex; /* General "must" iterator */
2282 1.39 christos size_t suffix; /* Keeps track of matching suffix */
2283 1.39 christos size_t ssuffix; /* Keeps track of suffixes' suffix */
2284 1.39 christos size_t* pmatches; /* pmatches[k] points to the next i
2285 1.39 christos * such that i+1...mlen is a substring
2286 1.39 christos * of k+1...k+mlen-i-1
2287 1.39 christos */
2288 1.39 christos
2289 1.39 christos _DIAGASSERT(p != NULL);
2290 1.39 christos _DIAGASSERT(g != NULL);
2291 1.39 christos
2292 1.39 christos /* Avoid making errors worse */
2293 1.39 christos if (p->error != 0)
2294 1.39 christos return;
2295 1.39 christos
2296 1.39 christos pmatches = calloc(g->mlen, sizeof(*pmatches));
2297 1.39 christos if (pmatches == NULL) {
2298 1.39 christos g->matchjump = NULL;
2299 1.39 christos return;
2300 1.39 christos }
2301 1.39 christos
2302 1.39 christos g->matchjump = calloc(g->mlen, sizeof(*g->matchjump));
2303 1.39 christos if (g->matchjump == NULL) { /* Not a fatal error */
2304 1.39 christos free(pmatches);
2305 1.39 christos return;
2306 1.39 christos }
2307 1.39 christos
2308 1.39 christos /* Set maximum possible jump for each character in the pattern */
2309 1.39 christos for (mindex = 0; mindex < g->mlen; mindex++)
2310 1.39 christos g->matchjump[mindex] = 2 * g->mlen - mindex - 1;
2311 1.39 christos
2312 1.39 christos /* Compute pmatches[] */
2313 1.39 christos for (suffix = mindex = g->mlen; mindex-- > 0; suffix--) {
2314 1.39 christos pmatches[mindex] = suffix;
2315 1.39 christos
2316 1.39 christos /* If a mismatch is found, interrupting the substring,
2317 1.39 christos * compute the matchjump for that position. If no
2318 1.39 christos * mismatch is found, then a text substring mismatched
2319 1.39 christos * against the suffix will also mismatch against the
2320 1.39 christos * substring.
2321 1.39 christos */
2322 1.39 christos while (suffix < g->mlen
2323 1.39 christos && g->must[mindex] != g->must[suffix]) {
2324 1.39 christos g->matchjump[suffix] = MIN(g->matchjump[suffix],
2325 1.39 christos g->mlen - mindex - 1);
2326 1.39 christos suffix = pmatches[suffix];
2327 1.39 christos }
2328 1.39 christos }
2329 1.39 christos
2330 1.39 christos /* Compute the matchjump up to the last substring found to jump
2331 1.39 christos * to the beginning of the largest must pattern prefix matching
2332 1.39 christos * it's own suffix.
2333 1.39 christos */
2334 1.39 christos for (mindex = 0; mindex <= suffix; mindex++)
2335 1.39 christos g->matchjump[mindex] = MIN(g->matchjump[mindex],
2336 1.39 christos g->mlen + suffix - mindex);
2337 1.39 christos
2338 1.39 christos ssuffix = pmatches[suffix];
2339 1.39 christos while (suffix < g->mlen) {
2340 1.39 christos while (suffix <= ssuffix && suffix < g->mlen) {
2341 1.39 christos g->matchjump[suffix] = MIN(g->matchjump[suffix],
2342 1.39 christos g->mlen + ssuffix - suffix);
2343 1.39 christos suffix++;
2344 1.39 christos }
2345 1.39 christos if (suffix < g->mlen)
2346 1.39 christos ssuffix = pmatches[ssuffix];
2347 1.39 christos }
2348 1.39 christos
2349 1.39 christos free(pmatches);
2350 1.39 christos }
2351 1.39 christos
2352 1.39 christos /*
2353 1.1 jtc - pluscount - count + nesting
2354 1.9 perry == static sopno pluscount(struct parse *p, struct re_guts *g);
2355 1.1 jtc */
2356 1.1 jtc static sopno /* nesting depth */
2357 1.39 christos pluscount(struct parse *p, struct re_guts *g)
2358 1.1 jtc {
2359 1.9 perry sop *scan;
2360 1.9 perry sop s;
2361 1.9 perry sopno plusnest = 0;
2362 1.9 perry sopno maxnest = 0;
2363 1.14 lukem
2364 1.14 lukem _DIAGASSERT(p != NULL);
2365 1.14 lukem _DIAGASSERT(g != NULL);
2366 1.1 jtc
2367 1.1 jtc if (p->error != 0)
2368 1.1 jtc return(0); /* there may not be an OEND */
2369 1.1 jtc
2370 1.1 jtc scan = g->strip + 1;
2371 1.1 jtc do {
2372 1.1 jtc s = *scan++;
2373 1.1 jtc switch (OP(s)) {
2374 1.1 jtc case OPLUS_:
2375 1.1 jtc plusnest++;
2376 1.1 jtc break;
2377 1.1 jtc case O_PLUS:
2378 1.1 jtc if (plusnest > maxnest)
2379 1.1 jtc maxnest = plusnest;
2380 1.1 jtc plusnest--;
2381 1.1 jtc break;
2382 1.1 jtc }
2383 1.1 jtc } while (OP(s) != OEND);
2384 1.1 jtc if (plusnest != 0)
2385 1.1 jtc g->iflags |= BAD;
2386 1.1 jtc return(maxnest);
2387 1.1 jtc }
2388