regcomp.c revision 1.45 1 1.45 christos /* $NetBSD: regcomp.c,v 1.45 2021/02/26 19:24:47 christos Exp $ */
2 1.6 cgd
3 1.5 cgd /*-
4 1.39 christos * SPDX-License-Identifier: BSD-3-Clause
5 1.39 christos *
6 1.39 christos * Copyright (c) 1992, 1993, 1994 Henry Spencer.
7 1.5 cgd * Copyright (c) 1992, 1993, 1994
8 1.5 cgd * The Regents of the University of California. All rights reserved.
9 1.5 cgd *
10 1.39 christos * Copyright (c) 2011 The FreeBSD Foundation
11 1.39 christos * All rights reserved.
12 1.39 christos * Portions of this software were developed by David Chisnall
13 1.39 christos * under sponsorship from the FreeBSD Foundation.
14 1.39 christos *
15 1.5 cgd * This code is derived from software contributed to Berkeley by
16 1.5 cgd * Henry Spencer.
17 1.5 cgd *
18 1.5 cgd * Redistribution and use in source and binary forms, with or without
19 1.5 cgd * modification, are permitted provided that the following conditions
20 1.5 cgd * are met:
21 1.5 cgd * 1. Redistributions of source code must retain the above copyright
22 1.5 cgd * notice, this list of conditions and the following disclaimer.
23 1.5 cgd * 2. Redistributions in binary form must reproduce the above copyright
24 1.5 cgd * notice, this list of conditions and the following disclaimer in the
25 1.5 cgd * documentation and/or other materials provided with the distribution.
26 1.18 agc * 3. Neither the name of the University nor the names of its contributors
27 1.18 agc * may be used to endorse or promote products derived from this software
28 1.18 agc * without specific prior written permission.
29 1.18 agc *
30 1.18 agc * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31 1.18 agc * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 1.18 agc * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 1.18 agc * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34 1.18 agc * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35 1.18 agc * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36 1.18 agc * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37 1.18 agc * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38 1.18 agc * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 1.18 agc * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40 1.18 agc * SUCH DAMAGE.
41 1.18 agc *
42 1.18 agc * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
43 1.18 agc */
44 1.18 agc
45 1.45 christos #if HAVE_NBTOOL_CONFIG_H
46 1.45 christos #include "nbtool_config.h"
47 1.45 christos #endif
48 1.45 christos
49 1.7 christos #include <sys/cdefs.h>
50 1.6 cgd #if 0
51 1.5 cgd static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
52 1.39 christos __FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 368359 2020-12-05 03:18:48Z kevans $");
53 1.6 cgd #endif
54 1.45 christos __RCSID("$NetBSD: regcomp.c,v 1.45 2021/02/26 19:24:47 christos Exp $");
55 1.39 christos
56 1.39 christos #define _OPENBSD_SOURCE
57 1.42 christos
58 1.42 christos #ifndef LIBHACK
59 1.39 christos #define REGEX_GNU_EXTENSIONS
60 1.5 cgd
61 1.8 jtc #include "namespace.h"
62 1.42 christos #endif
63 1.1 jtc #include <sys/types.h>
64 1.39 christos #include <stdio.h>
65 1.39 christos #include <string.h>
66 1.1 jtc #include <ctype.h>
67 1.1 jtc #include <limits.h>
68 1.1 jtc #include <stdlib.h>
69 1.28 junyoung #include <regex.h>
70 1.39 christos #include <stdbool.h>
71 1.8 jtc
72 1.44 christos #if defined(__weak_alias) && !defined(LIBHACK)
73 1.16 mycroft __weak_alias(regcomp,_regcomp)
74 1.8 jtc #endif
75 1.1 jtc
76 1.39 christos #ifdef REGEX_LIBC_COLLATE
77 1.39 christos #include "collate.h"
78 1.39 christos #endif
79 1.39 christos
80 1.1 jtc #include "utils.h"
81 1.1 jtc #include "regex2.h"
82 1.1 jtc
83 1.1 jtc #include "cname.h"
84 1.1 jtc
85 1.1 jtc /*
86 1.39 christos * Branching context, used to keep track of branch state for all of the branch-
87 1.39 christos * aware functions. In addition to keeping track of branch positions for the
88 1.39 christos * p_branch_* functions, we use this to simplify some clumsiness in BREs for
89 1.39 christos * detection of whether ^ is acting as an anchor or being used erroneously and
90 1.39 christos * also for whether we're in a sub-expression or not.
91 1.39 christos */
92 1.39 christos struct branchc {
93 1.39 christos sopno start;
94 1.39 christos sopno back;
95 1.39 christos sopno fwd;
96 1.39 christos
97 1.39 christos int nbranch;
98 1.39 christos int nchain;
99 1.39 christos bool outer;
100 1.39 christos bool terminate;
101 1.39 christos };
102 1.39 christos
103 1.39 christos /*
104 1.1 jtc * parse structure, passed up and down to avoid global variables and
105 1.1 jtc * other clumsinesses
106 1.1 jtc */
107 1.1 jtc struct parse {
108 1.21 yamt const char *next; /* next character in RE */
109 1.21 yamt const char *end; /* end of string (-> NUL normally) */
110 1.1 jtc int error; /* has an error been seen? */
111 1.39 christos int gnuext;
112 1.1 jtc sop *strip; /* malloced strip */
113 1.1 jtc sopno ssize; /* malloced strip size (allocated) */
114 1.1 jtc sopno slen; /* malloced strip length (used) */
115 1.33 christos size_t ncsalloc; /* number of csets allocated */
116 1.1 jtc struct re_guts *g;
117 1.1 jtc # define NPAREN 10 /* we need to remember () 1-9 for back refs */
118 1.1 jtc sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
119 1.1 jtc sopno pend[NPAREN]; /* -> ) ([0] unused) */
120 1.39 christos bool allowbranch; /* can this expression branch? */
121 1.39 christos bool bre; /* convenience; is this a BRE? */
122 1.39 christos int pflags; /* other parsing flags -- legacy escapes? */
123 1.39 christos bool (*parse_expr)(struct parse *, struct branchc *);
124 1.39 christos void (*pre_parse)(struct parse *, struct branchc *);
125 1.39 christos void (*post_parse)(struct parse *, struct branchc *);
126 1.1 jtc };
127 1.1 jtc
128 1.39 christos #define PFLAG_LEGACY_ESC 0x00000001
129 1.39 christos
130 1.5 cgd /* ========= begin header generated by ./mkh ========= */
131 1.5 cgd #ifdef __cplusplus
132 1.5 cgd extern "C" {
133 1.5 cgd #endif
134 1.5 cgd
135 1.5 cgd /* === regcomp.c === */
136 1.39 christos static bool p_ere_exp(struct parse *p, struct branchc *bc);
137 1.26 junyoung static void p_str(struct parse *p);
138 1.39 christos static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
139 1.39 christos static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
140 1.39 christos static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
141 1.39 christos static bool p_branch_empty(struct parse *p, struct branchc *bc);
142 1.39 christos static bool p_branch_do(struct parse *p, struct branchc *bc);
143 1.39 christos static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
144 1.39 christos static void p_bre_post_parse(struct parse *p, struct branchc *bc);
145 1.39 christos static void p_re(struct parse *p, int end1, int end2);
146 1.39 christos static bool p_simp_re(struct parse *p, struct branchc *bc);
147 1.26 junyoung static int p_count(struct parse *p);
148 1.26 junyoung static void p_bracket(struct parse *p);
149 1.39 christos static int p_range_cmp(wchar_t c1, wchar_t c2);
150 1.26 junyoung static void p_b_term(struct parse *p, cset *cs);
151 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
152 1.39 christos static int p_b_pseudoclass(struct parse *p, char c);
153 1.39 christos #endif
154 1.26 junyoung static void p_b_cclass(struct parse *p, cset *cs);
155 1.39 christos static void p_b_cclass_named(struct parse *p, cset *cs, const char[]);
156 1.26 junyoung static void p_b_eclass(struct parse *p, cset *cs);
157 1.39 christos static wint_t p_b_symbol(struct parse *p);
158 1.39 christos static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
159 1.39 christos static bool may_escape(struct parse *p, const wint_t ch);
160 1.39 christos static wint_t othercase(wint_t ch);
161 1.39 christos static void bothcases(struct parse *p, wint_t ch);
162 1.39 christos static void ordinary(struct parse *p, wint_t ch);
163 1.26 junyoung static void nonnewline(struct parse *p);
164 1.39 christos static void repeat(struct parse *p, sopno start, int from, int to);
165 1.26 junyoung static int seterr(struct parse *p, int e);
166 1.26 junyoung static cset *allocset(struct parse *p);
167 1.26 junyoung static void freeset(struct parse *p, cset *cs);
168 1.39 christos static void CHadd(struct parse *p, cset *cs, wint_t ch);
169 1.39 christos static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
170 1.39 christos static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
171 1.39 christos static wint_t singleton(cset *cs);
172 1.26 junyoung static sopno dupl(struct parse *p, sopno start, sopno finish);
173 1.39 christos static void doemit(struct parse *p, sop op, size_t opnd);
174 1.39 christos static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
175 1.39 christos static void dofwd(struct parse *p, sopno pos, sop value);
176 1.30 christos static int enlarge(struct parse *p, sopno size);
177 1.26 junyoung static void stripsnug(struct parse *p, struct re_guts *g);
178 1.26 junyoung static void findmust(struct parse *p, struct re_guts *g);
179 1.39 christos static int altoffset(sop *scan, int offset);
180 1.39 christos static void computejumps(struct parse *p, struct re_guts *g);
181 1.39 christos static void computematchjumps(struct parse *p, struct re_guts *g);
182 1.26 junyoung static sopno pluscount(struct parse *p, struct re_guts *g);
183 1.39 christos static wint_t wgetnext(struct parse *p);
184 1.5 cgd
185 1.5 cgd #ifdef __cplusplus
186 1.5 cgd }
187 1.5 cgd #endif
188 1.5 cgd /* ========= end header generated by ./mkh ========= */
189 1.1 jtc
190 1.1 jtc static char nuls[10]; /* place to point scanner in event of error */
191 1.1 jtc
192 1.1 jtc /*
193 1.1 jtc * macros for use with parse structure
194 1.1 jtc * BEWARE: these know that the parse structure is named `p' !!!
195 1.1 jtc */
196 1.1 jtc #define PEEK() (*p->next)
197 1.1 jtc #define PEEK2() (*(p->next+1))
198 1.1 jtc #define MORE() (p->next < p->end)
199 1.1 jtc #define MORE2() (p->next+1 < p->end)
200 1.1 jtc #define SEE(c) (MORE() && PEEK() == (c))
201 1.1 jtc #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
202 1.39 christos #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a))
203 1.1 jtc #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
204 1.1 jtc #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
205 1.39 christos #define EATSPEC(a) (p->bre ? EATTWO('\\', a) : EAT(a))
206 1.1 jtc #define NEXT() (p->next++)
207 1.1 jtc #define NEXT2() (p->next += 2)
208 1.1 jtc #define NEXTn(n) (p->next += (n))
209 1.1 jtc #define GETNEXT() (*p->next++)
210 1.39 christos #define WGETNEXT() wgetnext(p)
211 1.1 jtc #define SETERROR(e) seterr(p, (e))
212 1.39 christos #define REQUIRE(co, e) ((co) || SETERROR(e))
213 1.1 jtc #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
214 1.39 christos #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
215 1.1 jtc #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
216 1.40 christos #define EMIT(op, sopnd) doemit(p, (op), (sopnd))
217 1.40 christos #define INSERT(op, pos) doinsert(p, (op), HERE()-(pos)+1, pos)
218 1.12 drochner #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
219 1.1 jtc #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
220 1.1 jtc #define HERE() (p->slen)
221 1.1 jtc #define THERE() (p->slen - 1)
222 1.4 jtc #define THERETHERE() (p->slen - 2)
223 1.1 jtc #define DROP(n) (p->slen -= (n))
224 1.1 jtc
225 1.39 christos /* Macro used by computejump()/computematchjump() */
226 1.41 christos #ifndef MIN
227 1.39 christos #define MIN(a,b) ((a)<(b)?(a):(b))
228 1.41 christos #endif
229 1.30 christos
230 1.42 christos #ifndef NLS
231 1.42 christos static const struct {
232 1.42 christos const char *name;
233 1.42 christos int (*func)(int);
234 1.42 christos } wctypes[] = {
235 1.42 christos #define ADD(x) { .name = # x, .func = is ## x }
236 1.42 christos ADD(alnum),
237 1.42 christos ADD(alpha),
238 1.42 christos ADD(blank),
239 1.42 christos ADD(cntrl),
240 1.42 christos ADD(digit),
241 1.42 christos ADD(graph),
242 1.42 christos ADD(lower),
243 1.42 christos ADD(print),
244 1.42 christos ADD(punct),
245 1.42 christos ADD(space),
246 1.42 christos ADD(upper),
247 1.42 christos ADD(xdigit),
248 1.42 christos #undef ADD
249 1.42 christos };
250 1.42 christos
251 1.42 christos wctype_t
252 1.42 christos __regex_wctype(const char *str)
253 1.42 christos {
254 1.42 christos for (size_t i = 0; i < __arraycount(wctypes); i++) {
255 1.42 christos if (strcmp(wctypes[i].name, str) == 0)
256 1.42 christos return (wctype_t)(i + 1);
257 1.42 christos }
258 1.42 christos return (wctype_t)0;
259 1.42 christos }
260 1.42 christos
261 1.42 christos int
262 1.42 christos __regex_iswctype(wint_t c, wctype_t ct)
263 1.42 christos {
264 1.42 christos if (ct == 0)
265 1.42 christos return 0;
266 1.42 christos return (*wctypes[ct - 1].func)(c);
267 1.42 christos }
268 1.42 christos #endif
269 1.42 christos
270 1.39 christos static int /* 0 success, otherwise REG_something */
271 1.39 christos regcomp_internal(regex_t * __restrict preg,
272 1.39 christos const char * __restrict pattern,
273 1.39 christos int cflags, int pflags)
274 1.1 jtc {
275 1.1 jtc struct parse pa;
276 1.9 perry struct re_guts *g;
277 1.9 perry struct parse *p = &pa;
278 1.9 perry int i;
279 1.9 perry size_t len;
280 1.39 christos size_t maxlen;
281 1.3 jtc #ifdef REDEBUG
282 1.3 jtc # define GOODFLAGS(f) (f)
283 1.3 jtc #else
284 1.3 jtc # define GOODFLAGS(f) ((f)&~REG_DUMP)
285 1.3 jtc #endif
286 1.1 jtc
287 1.14 lukem _DIAGASSERT(preg != NULL);
288 1.14 lukem _DIAGASSERT(pattern != NULL);
289 1.14 lukem
290 1.3 jtc cflags = GOODFLAGS(cflags);
291 1.1 jtc if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
292 1.1 jtc return(REG_INVARG);
293 1.1 jtc
294 1.1 jtc if (cflags®_PEND) {
295 1.1 jtc if (preg->re_endp < pattern)
296 1.1 jtc return(REG_INVARG);
297 1.1 jtc len = preg->re_endp - pattern;
298 1.1 jtc } else
299 1.11 christos len = strlen(pattern);
300 1.1 jtc
301 1.1 jtc /* do the mallocs early so failure handling is easy */
302 1.39 christos g = malloc(sizeof(*g));
303 1.1 jtc if (g == NULL)
304 1.1 jtc return(REG_ESPACE);
305 1.39 christos /*
306 1.39 christos * Limit the pattern space to avoid a 32-bit overflow on buffer
307 1.39 christos * extension. Also avoid any signed overflow in case of conversion
308 1.39 christos * so make the real limit based on a 31-bit overflow.
309 1.39 christos *
310 1.39 christos * Likely not applicable on 64-bit systems but handle the case
311 1.39 christos * generically (who are we to stop people from using ~715MB+
312 1.39 christos * patterns?).
313 1.39 christos */
314 1.40 christos maxlen = ((size_t)-1 >> 1) / sizeof(*p->strip) * 2 / 3;
315 1.39 christos if (len >= maxlen) {
316 1.40 christos free(g);
317 1.39 christos return(REG_ESPACE);
318 1.39 christos }
319 1.40 christos p->ssize = (sopno)(len / 2 * 3 + 1); /* ugh */
320 1.39 christos assert(p->ssize >= len);
321 1.39 christos
322 1.39 christos p->strip = calloc(p->ssize, sizeof(*p->strip));
323 1.1 jtc p->slen = 0;
324 1.1 jtc if (p->strip == NULL) {
325 1.40 christos free(g);
326 1.1 jtc return(REG_ESPACE);
327 1.1 jtc }
328 1.1 jtc
329 1.1 jtc /* set things up */
330 1.1 jtc p->g = g;
331 1.39 christos p->next = pattern; /* convenience; we do not modify it */
332 1.1 jtc p->end = p->next + len;
333 1.1 jtc p->error = 0;
334 1.1 jtc p->ncsalloc = 0;
335 1.39 christos p->pflags = pflags;
336 1.1 jtc for (i = 0; i < NPAREN; i++) {
337 1.1 jtc p->pbegin[i] = 0;
338 1.1 jtc p->pend[i] = 0;
339 1.1 jtc }
340 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
341 1.39 christos if ((cflags & REG_GNU) == 0) {
342 1.39 christos p->gnuext = false;
343 1.39 christos p->allowbranch = (cflags & REG_EXTENDED) != 0;
344 1.39 christos } else
345 1.39 christos p->gnuext = p->allowbranch = true;
346 1.39 christos #else
347 1.39 christos p->gnuext = false;
348 1.39 christos p->allowbranch = (cflags & REG_EXTENDED) != 0;
349 1.39 christos #endif
350 1.39 christos if (cflags & REG_EXTENDED) {
351 1.39 christos p->bre = false;
352 1.39 christos p->parse_expr = p_ere_exp;
353 1.39 christos p->pre_parse = NULL;
354 1.39 christos p->post_parse = NULL;
355 1.39 christos } else {
356 1.39 christos p->bre = true;
357 1.39 christos p->parse_expr = p_simp_re;
358 1.39 christos p->pre_parse = p_bre_pre_parse;
359 1.39 christos p->post_parse = p_bre_post_parse;
360 1.39 christos }
361 1.1 jtc g->sets = NULL;
362 1.1 jtc g->ncsets = 0;
363 1.1 jtc g->cflags = cflags;
364 1.1 jtc g->iflags = 0;
365 1.1 jtc g->nbol = 0;
366 1.1 jtc g->neol = 0;
367 1.1 jtc g->must = NULL;
368 1.39 christos g->moffset = -1;
369 1.39 christos g->charjump = NULL;
370 1.39 christos g->matchjump = NULL;
371 1.1 jtc g->mlen = 0;
372 1.1 jtc g->nsub = 0;
373 1.1 jtc g->backrefs = 0;
374 1.1 jtc
375 1.1 jtc /* do it */
376 1.1 jtc EMIT(OEND, 0);
377 1.1 jtc g->firststate = THERE();
378 1.39 christos if (cflags & REG_NOSPEC)
379 1.1 jtc p_str(p);
380 1.1 jtc else
381 1.39 christos p_re(p, OUT, OUT);
382 1.1 jtc EMIT(OEND, 0);
383 1.1 jtc g->laststate = THERE();
384 1.1 jtc
385 1.1 jtc /* tidy up loose ends and fill things in */
386 1.1 jtc stripsnug(p, g);
387 1.1 jtc findmust(p, g);
388 1.39 christos /* only use Boyer-Moore algorithm if the pattern is bigger
389 1.39 christos * than three characters
390 1.39 christos */
391 1.39 christos if(g->mlen > 3) {
392 1.39 christos computejumps(p, g);
393 1.39 christos computematchjumps(p, g);
394 1.39 christos if(g->matchjump == NULL && g->charjump != NULL) {
395 1.39 christos free(g->charjump);
396 1.39 christos g->charjump = NULL;
397 1.39 christos }
398 1.39 christos }
399 1.1 jtc g->nplus = pluscount(p, g);
400 1.1 jtc g->magic = MAGIC2;
401 1.1 jtc preg->re_nsub = g->nsub;
402 1.1 jtc preg->re_g = g;
403 1.1 jtc preg->re_magic = MAGIC1;
404 1.1 jtc #ifndef REDEBUG
405 1.1 jtc /* not debugging, so can't rely on the assert() in regexec() */
406 1.1 jtc if (g->iflags&BAD)
407 1.1 jtc SETERROR(REG_ASSERT);
408 1.1 jtc #endif
409 1.1 jtc
410 1.1 jtc /* win or lose, we're done */
411 1.1 jtc if (p->error != 0) /* lose */
412 1.1 jtc regfree(preg);
413 1.1 jtc return(p->error);
414 1.1 jtc }
415 1.1 jtc
416 1.1 jtc /*
417 1.39 christos - regcomp - interface for parser and compilation
418 1.39 christos = extern int regcomp(regex_t *, const char *, int);
419 1.39 christos = #define REG_BASIC 0000
420 1.39 christos = #define REG_EXTENDED 0001
421 1.39 christos = #define REG_ICASE 0002
422 1.39 christos = #define REG_NOSUB 0004
423 1.39 christos = #define REG_NEWLINE 0010
424 1.39 christos = #define REG_NOSPEC 0020
425 1.39 christos = #define REG_PEND 0040
426 1.39 christos = #define REG_DUMP 0200
427 1.1 jtc */
428 1.39 christos int /* 0 success, otherwise REG_something */
429 1.39 christos regcomp(regex_t * __restrict preg,
430 1.39 christos const char * __restrict pattern,
431 1.39 christos int cflags)
432 1.1 jtc {
433 1.30 christos
434 1.39 christos return (regcomp_internal(preg, pattern, cflags, 0));
435 1.1 jtc }
436 1.1 jtc
437 1.1 jtc /*
438 1.39 christos - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
439 1.39 christos - return whether we should terminate or not
440 1.39 christos == static bool p_ere_exp(struct parse *p);
441 1.1 jtc */
442 1.39 christos static bool
443 1.39 christos p_ere_exp(struct parse *p, struct branchc *bc)
444 1.1 jtc {
445 1.9 perry char c;
446 1.39 christos wint_t wc;
447 1.9 perry sopno pos;
448 1.9 perry int count;
449 1.9 perry int count2;
450 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
451 1.40 christos size_t i;
452 1.39 christos int handled;
453 1.39 christos #endif
454 1.9 perry sopno subno;
455 1.1 jtc int wascaret = 0;
456 1.1 jtc
457 1.14 lukem _DIAGASSERT(p != NULL);
458 1.14 lukem
459 1.39 christos (void)bc;
460 1.1 jtc assert(MORE()); /* caller should have ensured this */
461 1.1 jtc c = GETNEXT();
462 1.1 jtc
463 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
464 1.39 christos handled = 0;
465 1.39 christos #endif
466 1.1 jtc pos = HERE();
467 1.1 jtc switch (c) {
468 1.1 jtc case '(':
469 1.39 christos (void)REQUIRE(MORE(), REG_EPAREN);
470 1.1 jtc p->g->nsub++;
471 1.40 christos subno = (sopno)p->g->nsub;
472 1.1 jtc if (subno < NPAREN)
473 1.1 jtc p->pbegin[subno] = HERE();
474 1.1 jtc EMIT(OLPAREN, subno);
475 1.1 jtc if (!SEE(')'))
476 1.39 christos p_re(p, ')', IGN);
477 1.1 jtc if (subno < NPAREN) {
478 1.1 jtc p->pend[subno] = HERE();
479 1.1 jtc assert(p->pend[subno] != 0);
480 1.1 jtc }
481 1.1 jtc EMIT(ORPAREN, subno);
482 1.39 christos (void)MUSTEAT(')', REG_EPAREN);
483 1.1 jtc break;
484 1.1 jtc #ifndef POSIX_MISTAKE
485 1.1 jtc case ')': /* happens only if no current unmatched ( */
486 1.1 jtc /*
487 1.1 jtc * You may ask, why the ifndef? Because I didn't notice
488 1.1 jtc * this until slightly too late for 1003.2, and none of the
489 1.1 jtc * other 1003.2 regular-expression reviewers noticed it at
490 1.1 jtc * all. So an unmatched ) is legal POSIX, at least until
491 1.1 jtc * we can get it fixed.
492 1.1 jtc */
493 1.1 jtc SETERROR(REG_EPAREN);
494 1.1 jtc break;
495 1.1 jtc #endif
496 1.1 jtc case '^':
497 1.1 jtc EMIT(OBOL, 0);
498 1.1 jtc p->g->iflags |= USEBOL;
499 1.1 jtc p->g->nbol++;
500 1.1 jtc wascaret = 1;
501 1.1 jtc break;
502 1.1 jtc case '$':
503 1.1 jtc EMIT(OEOL, 0);
504 1.1 jtc p->g->iflags |= USEEOL;
505 1.1 jtc p->g->neol++;
506 1.1 jtc break;
507 1.1 jtc case '|':
508 1.1 jtc SETERROR(REG_EMPTY);
509 1.1 jtc break;
510 1.1 jtc case '*':
511 1.1 jtc case '+':
512 1.1 jtc case '?':
513 1.39 christos case '{':
514 1.1 jtc SETERROR(REG_BADRPT);
515 1.1 jtc break;
516 1.1 jtc case '.':
517 1.1 jtc if (p->g->cflags®_NEWLINE)
518 1.1 jtc nonnewline(p);
519 1.1 jtc else
520 1.1 jtc EMIT(OANY, 0);
521 1.1 jtc break;
522 1.1 jtc case '[':
523 1.1 jtc p_bracket(p);
524 1.1 jtc break;
525 1.1 jtc case '\\':
526 1.39 christos (void)REQUIRE(MORE(), REG_EESCAPE);
527 1.39 christos wc = WGETNEXT();
528 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
529 1.39 christos if (p->gnuext) {
530 1.39 christos handled = 1;
531 1.39 christos switch (wc) {
532 1.39 christos case '`':
533 1.39 christos EMIT(OBOS, 0);
534 1.39 christos break;
535 1.39 christos case '\'':
536 1.39 christos EMIT(OEOS, 0);
537 1.39 christos break;
538 1.39 christos case 'B':
539 1.39 christos EMIT(ONWBND, 0);
540 1.39 christos break;
541 1.39 christos case 'b':
542 1.39 christos EMIT(OWBND, 0);
543 1.39 christos break;
544 1.39 christos case 'W':
545 1.39 christos case 'w':
546 1.39 christos case 'S':
547 1.39 christos case 's':
548 1.39 christos p_b_pseudoclass(p, wc);
549 1.39 christos break;
550 1.39 christos case '1':
551 1.39 christos case '2':
552 1.39 christos case '3':
553 1.39 christos case '4':
554 1.39 christos case '5':
555 1.39 christos case '6':
556 1.39 christos case '7':
557 1.39 christos case '8':
558 1.39 christos case '9':
559 1.39 christos i = wc - '0';
560 1.39 christos assert(i < NPAREN);
561 1.39 christos if (p->pend[i] != 0) {
562 1.39 christos assert(i <= p->g->nsub);
563 1.39 christos EMIT(OBACK_, i);
564 1.39 christos assert(p->pbegin[i] != 0);
565 1.39 christos assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
566 1.39 christos assert(OP(p->strip[p->pend[i]]) == ORPAREN);
567 1.39 christos (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
568 1.39 christos EMIT(O_BACK, i);
569 1.39 christos } else
570 1.39 christos SETERROR(REG_ESUBREG);
571 1.39 christos p->g->backrefs = 1;
572 1.39 christos break;
573 1.39 christos default:
574 1.39 christos handled = 0;
575 1.39 christos }
576 1.39 christos /* Don't proceed to the POSIX bits if we've already handled it */
577 1.39 christos if (handled)
578 1.39 christos break;
579 1.39 christos }
580 1.39 christos #endif
581 1.39 christos switch (wc) {
582 1.39 christos case '<':
583 1.39 christos EMIT(OBOW, 0);
584 1.39 christos break;
585 1.39 christos case '>':
586 1.39 christos EMIT(OEOW, 0);
587 1.39 christos break;
588 1.39 christos default:
589 1.39 christos if (may_escape(p, wc))
590 1.39 christos ordinary(p, wc);
591 1.39 christos else
592 1.39 christos SETERROR(REG_EESCAPE);
593 1.39 christos break;
594 1.39 christos }
595 1.1 jtc break;
596 1.1 jtc default:
597 1.38 christos if (p->error != 0)
598 1.39 christos return (false);
599 1.39 christos p->next--;
600 1.39 christos wc = WGETNEXT();
601 1.39 christos ordinary(p, wc);
602 1.1 jtc break;
603 1.1 jtc }
604 1.1 jtc
605 1.1 jtc if (!MORE())
606 1.39 christos return (false);
607 1.1 jtc c = PEEK();
608 1.1 jtc /* we call { a repetition if followed by a digit */
609 1.39 christos if (!( c == '*' || c == '+' || c == '?' || c == '{'))
610 1.39 christos return (false); /* no repetition, we're done */
611 1.39 christos else if (c == '{')
612 1.39 christos (void)REQUIRE(MORE2() && \
613 1.39 christos (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
614 1.1 jtc NEXT();
615 1.1 jtc
616 1.39 christos (void)REQUIRE(!wascaret, REG_BADRPT);
617 1.1 jtc switch (c) {
618 1.1 jtc case '*': /* implemented as +? */
619 1.4 jtc /* this case does not require the (y|) trick, noKLUDGE */
620 1.1 jtc INSERT(OPLUS_, pos);
621 1.1 jtc ASTERN(O_PLUS, pos);
622 1.1 jtc INSERT(OQUEST_, pos);
623 1.1 jtc ASTERN(O_QUEST, pos);
624 1.1 jtc break;
625 1.1 jtc case '+':
626 1.1 jtc INSERT(OPLUS_, pos);
627 1.1 jtc ASTERN(O_PLUS, pos);
628 1.1 jtc break;
629 1.1 jtc case '?':
630 1.4 jtc /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
631 1.4 jtc INSERT(OCH_, pos); /* offset slightly wrong */
632 1.4 jtc ASTERN(OOR1, pos); /* this one's right */
633 1.4 jtc AHEAD(pos); /* fix the OCH_ */
634 1.4 jtc EMIT(OOR2, 0); /* offset very wrong... */
635 1.4 jtc AHEAD(THERE()); /* ...so fix it */
636 1.4 jtc ASTERN(O_CH, THERETHERE());
637 1.1 jtc break;
638 1.1 jtc case '{':
639 1.1 jtc count = p_count(p);
640 1.1 jtc if (EAT(',')) {
641 1.39 christos if (isdigit((uch)PEEK())) {
642 1.1 jtc count2 = p_count(p);
643 1.39 christos (void)REQUIRE(count <= count2, REG_BADBR);
644 1.1 jtc } else /* single number with comma */
645 1.1 jtc count2 = INFINITY;
646 1.1 jtc } else /* just a single number */
647 1.1 jtc count2 = count;
648 1.39 christos repeat(p, pos, count, count2);
649 1.1 jtc if (!EAT('}')) { /* error heuristics */
650 1.1 jtc while (MORE() && PEEK() != '}')
651 1.1 jtc NEXT();
652 1.39 christos (void)REQUIRE(MORE(), REG_EBRACE);
653 1.1 jtc SETERROR(REG_BADBR);
654 1.1 jtc }
655 1.1 jtc break;
656 1.1 jtc }
657 1.1 jtc
658 1.1 jtc if (!MORE())
659 1.39 christos return (false);
660 1.1 jtc c = PEEK();
661 1.1 jtc if (!( c == '*' || c == '+' || c == '?' ||
662 1.39 christos (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
663 1.39 christos return (false);
664 1.1 jtc SETERROR(REG_BADRPT);
665 1.39 christos return (false);
666 1.1 jtc }
667 1.1 jtc
668 1.1 jtc /*
669 1.1 jtc - p_str - string (no metacharacters) "parser"
670 1.9 perry == static void p_str(struct parse *p);
671 1.1 jtc */
672 1.1 jtc static void
673 1.39 christos p_str(struct parse *p)
674 1.1 jtc {
675 1.39 christos (void)REQUIRE(MORE(), REG_EMPTY);
676 1.39 christos while (MORE())
677 1.39 christos ordinary(p, WGETNEXT());
678 1.39 christos }
679 1.14 lukem
680 1.39 christos /*
681 1.39 christos * Eat consecutive branch delimiters for the kind of expression that we are
682 1.39 christos * parsing, return the number of delimiters that we ate.
683 1.39 christos */
684 1.39 christos static int
685 1.39 christos p_branch_eat_delim(struct parse *p, struct branchc *bc)
686 1.39 christos {
687 1.39 christos int nskip;
688 1.14 lukem
689 1.39 christos (void)bc;
690 1.39 christos nskip = 0;
691 1.39 christos while (EATSPEC('|'))
692 1.39 christos ++nskip;
693 1.39 christos return (nskip);
694 1.1 jtc }
695 1.1 jtc
696 1.1 jtc /*
697 1.39 christos * Insert necessary branch book-keeping operations. This emits a
698 1.39 christos * bogus 'next' offset, since we still have more to parse
699 1.1 jtc */
700 1.1 jtc static void
701 1.39 christos p_branch_ins_offset(struct parse *p, struct branchc *bc)
702 1.9 perry {
703 1.1 jtc
704 1.39 christos if (bc->nbranch == 0) {
705 1.39 christos INSERT(OCH_, bc->start); /* offset is wrong */
706 1.39 christos bc->fwd = bc->start;
707 1.39 christos bc->back = bc->start;
708 1.39 christos }
709 1.39 christos
710 1.39 christos ASTERN(OOR1, bc->back);
711 1.39 christos bc->back = THERE();
712 1.39 christos AHEAD(bc->fwd); /* fix previous offset */
713 1.39 christos bc->fwd = HERE();
714 1.39 christos EMIT(OOR2, 0); /* offset is very wrong */
715 1.39 christos ++bc->nbranch;
716 1.39 christos }
717 1.39 christos
718 1.39 christos /*
719 1.39 christos * Fix the offset of the tail branch, if we actually had any branches.
720 1.39 christos * This is to correct the bogus placeholder offset that we use.
721 1.39 christos */
722 1.39 christos static void
723 1.39 christos p_branch_fix_tail(struct parse *p, struct branchc *bc)
724 1.39 christos {
725 1.14 lukem
726 1.39 christos /* Fix bogus offset at the tail if we actually have branches */
727 1.39 christos if (bc->nbranch > 0) {
728 1.39 christos AHEAD(bc->fwd);
729 1.39 christos ASTERN(O_CH, bc->back);
730 1.30 christos }
731 1.39 christos }
732 1.39 christos
733 1.39 christos /*
734 1.39 christos * Signal to the parser that an empty branch has been encountered; this will,
735 1.39 christos * in the future, be used to allow for more permissive behavior with empty
736 1.39 christos * branches. The return value should indicate whether parsing may continue
737 1.39 christos * or not.
738 1.39 christos */
739 1.39 christos static bool
740 1.39 christos p_branch_empty(struct parse *p, struct branchc *bc)
741 1.39 christos {
742 1.39 christos
743 1.39 christos (void)bc;
744 1.39 christos SETERROR(REG_EMPTY);
745 1.39 christos return (false);
746 1.39 christos }
747 1.39 christos
748 1.39 christos /*
749 1.39 christos * Take care of any branching requirements. This includes inserting the
750 1.39 christos * appropriate branching instructions as well as eating all of the branch
751 1.39 christos * delimiters until we either run out of pattern or need to parse more pattern.
752 1.39 christos */
753 1.39 christos static bool
754 1.39 christos p_branch_do(struct parse *p, struct branchc *bc)
755 1.39 christos {
756 1.39 christos int ate = 0;
757 1.39 christos
758 1.39 christos ate = p_branch_eat_delim(p, bc);
759 1.39 christos if (ate == 0)
760 1.39 christos return (false);
761 1.39 christos else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
762 1.39 christos /*
763 1.39 christos * Halt parsing only if we have an empty branch and p_branch_empty
764 1.39 christos * indicates that we must not continue. In the future, this will not
765 1.39 christos * necessarily be an error.
766 1.39 christos */
767 1.39 christos return (false);
768 1.39 christos p_branch_ins_offset(p, bc);
769 1.39 christos
770 1.39 christos return (true);
771 1.39 christos }
772 1.30 christos
773 1.39 christos static void
774 1.39 christos p_bre_pre_parse(struct parse *p, struct branchc *bc)
775 1.39 christos {
776 1.14 lukem
777 1.40 christos (void)bc;
778 1.39 christos /*
779 1.39 christos * Does not move cleanly into expression parser because of
780 1.39 christos * ordinary interpration of * at the beginning position of
781 1.39 christos * an expression.
782 1.39 christos */
783 1.1 jtc if (EAT('^')) {
784 1.1 jtc EMIT(OBOL, 0);
785 1.1 jtc p->g->iflags |= USEBOL;
786 1.1 jtc p->g->nbol++;
787 1.1 jtc }
788 1.39 christos }
789 1.39 christos
790 1.39 christos static void
791 1.39 christos p_bre_post_parse(struct parse *p, struct branchc *bc)
792 1.39 christos {
793 1.39 christos
794 1.39 christos /* Expression is terminating due to EOL token */
795 1.39 christos if (bc->terminate) {
796 1.1 jtc DROP(1);
797 1.1 jtc EMIT(OEOL, 0);
798 1.1 jtc p->g->iflags |= USEEOL;
799 1.1 jtc p->g->neol++;
800 1.1 jtc }
801 1.39 christos }
802 1.1 jtc
803 1.39 christos /*
804 1.39 christos - p_re - Top level parser, concatenation and BRE anchoring
805 1.39 christos == static void p_re(struct parse *p, int end1, int end2);
806 1.39 christos * Giving end1 as OUT essentially eliminates the end1/end2 check.
807 1.39 christos *
808 1.39 christos * This implementation is a bit of a kludge, in that a trailing $ is first
809 1.39 christos * taken as an ordinary character and then revised to be an anchor.
810 1.39 christos * The amount of lookahead needed to avoid this kludge is excessive.
811 1.39 christos */
812 1.39 christos static void
813 1.39 christos p_re(struct parse *p,
814 1.39 christos int end1, /* first terminating character */
815 1.39 christos int end2) /* second terminating character; ignored for EREs */
816 1.39 christos {
817 1.39 christos struct branchc bc;
818 1.39 christos
819 1.39 christos bc.nbranch = 0;
820 1.39 christos if (end1 == OUT && end2 == OUT)
821 1.39 christos bc.outer = true;
822 1.39 christos else
823 1.39 christos bc.outer = false;
824 1.39 christos #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2))
825 1.39 christos for (;;) {
826 1.39 christos bc.start = HERE();
827 1.39 christos bc.nchain = 0;
828 1.39 christos bc.terminate = false;
829 1.39 christos if (p->pre_parse != NULL)
830 1.39 christos p->pre_parse(p, &bc);
831 1.39 christos while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
832 1.39 christos bc.terminate = p->parse_expr(p, &bc);
833 1.39 christos ++bc.nchain;
834 1.39 christos }
835 1.39 christos if (p->post_parse != NULL)
836 1.39 christos p->post_parse(p, &bc);
837 1.39 christos (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY);
838 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
839 1.39 christos if (p->gnuext && HERE() == bc.start && !p_branch_empty(p, &bc))
840 1.39 christos break;
841 1.39 christos #endif
842 1.39 christos if (!p->allowbranch)
843 1.39 christos break;
844 1.39 christos /*
845 1.39 christos * p_branch_do's return value indicates whether we should
846 1.39 christos * continue parsing or not. This is both for correctness and
847 1.39 christos * a slight optimization, because it will check if we've
848 1.39 christos * encountered an empty branch or the end of the string
849 1.39 christos * immediately following a branch delimiter.
850 1.39 christos */
851 1.39 christos if (!p_branch_do(p, &bc))
852 1.39 christos break;
853 1.39 christos }
854 1.39 christos #undef SEE_END
855 1.39 christos if (p->allowbranch)
856 1.39 christos p_branch_fix_tail(p, &bc);
857 1.39 christos assert(!MORE() || SEE(end1));
858 1.1 jtc }
859 1.1 jtc
860 1.1 jtc /*
861 1.1 jtc - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
862 1.39 christos == static bool p_simp_re(struct parse *p, struct branchc *bc);
863 1.1 jtc */
864 1.39 christos static bool /* was the simple RE an unbackslashed $? */
865 1.39 christos p_simp_re(struct parse *p, struct branchc *bc)
866 1.1 jtc {
867 1.9 perry int c;
868 1.39 christos int cc; /* convenient/control character */
869 1.9 perry int count;
870 1.9 perry int count2;
871 1.39 christos sopno pos;
872 1.39 christos bool handled;
873 1.40 christos size_t i;
874 1.39 christos wint_t wc;
875 1.9 perry sopno subno;
876 1.1 jtc # define BACKSL (1<<CHAR_BIT)
877 1.1 jtc
878 1.39 christos pos = HERE(); /* repetition op, if any, covers from here */
879 1.39 christos handled = false;
880 1.1 jtc
881 1.1 jtc assert(MORE()); /* caller should have ensured this */
882 1.1 jtc c = GETNEXT();
883 1.1 jtc if (c == '\\') {
884 1.39 christos (void)REQUIRE(MORE(), REG_EESCAPE);
885 1.39 christos cc = GETNEXT();
886 1.39 christos c = BACKSL | cc;
887 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
888 1.39 christos if (p->gnuext) {
889 1.39 christos handled = true;
890 1.39 christos switch (c) {
891 1.39 christos case BACKSL|'`':
892 1.39 christos EMIT(OBOS, 0);
893 1.39 christos break;
894 1.39 christos case BACKSL|'\'':
895 1.39 christos EMIT(OEOS, 0);
896 1.39 christos break;
897 1.39 christos case BACKSL|'B':
898 1.39 christos EMIT(ONWBND, 0);
899 1.39 christos break;
900 1.39 christos case BACKSL|'b':
901 1.39 christos EMIT(OWBND, 0);
902 1.39 christos break;
903 1.39 christos case BACKSL|'W':
904 1.39 christos case BACKSL|'w':
905 1.39 christos case BACKSL|'S':
906 1.39 christos case BACKSL|'s':
907 1.39 christos p_b_pseudoclass(p, cc);
908 1.39 christos break;
909 1.39 christos default:
910 1.39 christos handled = false;
911 1.39 christos }
912 1.39 christos }
913 1.39 christos #endif
914 1.1 jtc }
915 1.39 christos if (!handled) {
916 1.39 christos switch (c) {
917 1.39 christos case '.':
918 1.39 christos if (p->g->cflags®_NEWLINE)
919 1.39 christos nonnewline(p);
920 1.39 christos else
921 1.39 christos EMIT(OANY, 0);
922 1.39 christos break;
923 1.39 christos case '[':
924 1.39 christos p_bracket(p);
925 1.39 christos break;
926 1.39 christos case BACKSL|'<':
927 1.39 christos EMIT(OBOW, 0);
928 1.39 christos break;
929 1.39 christos case BACKSL|'>':
930 1.39 christos EMIT(OEOW, 0);
931 1.39 christos break;
932 1.39 christos case BACKSL|'{':
933 1.39 christos SETERROR(REG_BADRPT);
934 1.39 christos break;
935 1.39 christos case BACKSL|'(':
936 1.39 christos p->g->nsub++;
937 1.40 christos subno = (sopno)p->g->nsub;
938 1.39 christos if (subno < NPAREN)
939 1.39 christos p->pbegin[subno] = HERE();
940 1.39 christos EMIT(OLPAREN, subno);
941 1.39 christos /* the MORE here is an error heuristic */
942 1.39 christos if (MORE() && !SEETWO('\\', ')'))
943 1.39 christos p_re(p, '\\', ')');
944 1.39 christos if (subno < NPAREN) {
945 1.39 christos p->pend[subno] = HERE();
946 1.39 christos assert(p->pend[subno] != 0);
947 1.39 christos }
948 1.39 christos EMIT(ORPAREN, subno);
949 1.39 christos (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
950 1.39 christos break;
951 1.39 christos case BACKSL|')': /* should not get here -- must be user */
952 1.39 christos SETERROR(REG_EPAREN);
953 1.39 christos break;
954 1.39 christos case BACKSL|'1':
955 1.39 christos case BACKSL|'2':
956 1.39 christos case BACKSL|'3':
957 1.39 christos case BACKSL|'4':
958 1.39 christos case BACKSL|'5':
959 1.39 christos case BACKSL|'6':
960 1.39 christos case BACKSL|'7':
961 1.39 christos case BACKSL|'8':
962 1.39 christos case BACKSL|'9':
963 1.39 christos i = (c&~BACKSL) - '0';
964 1.39 christos assert(i < NPAREN);
965 1.39 christos if (p->pend[i] != 0) {
966 1.39 christos assert(i <= p->g->nsub);
967 1.39 christos EMIT(OBACK_, i);
968 1.39 christos assert(p->pbegin[i] != 0);
969 1.39 christos assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
970 1.39 christos assert(OP(p->strip[p->pend[i]]) == ORPAREN);
971 1.39 christos (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
972 1.39 christos EMIT(O_BACK, i);
973 1.39 christos } else
974 1.39 christos SETERROR(REG_ESUBREG);
975 1.39 christos p->g->backrefs = 1;
976 1.39 christos break;
977 1.39 christos case '*':
978 1.39 christos /*
979 1.39 christos * Ordinary if used as the first character beyond BOL anchor of
980 1.39 christos * a (sub-)expression, counts as a bad repetition operator if it
981 1.39 christos * appears otherwise.
982 1.39 christos */
983 1.39 christos (void)REQUIRE(bc->nchain == 0, REG_BADRPT);
984 1.39 christos /* FALLTHROUGH */
985 1.39 christos default:
986 1.39 christos if (p->error != 0)
987 1.39 christos return (false); /* Definitely not $... */
988 1.39 christos p->next--;
989 1.39 christos wc = WGETNEXT();
990 1.39 christos if ((c & BACKSL) == 0 || may_escape(p, wc))
991 1.39 christos ordinary(p, wc);
992 1.39 christos else
993 1.39 christos SETERROR(REG_EESCAPE);
994 1.39 christos break;
995 1.1 jtc }
996 1.1 jtc }
997 1.1 jtc
998 1.1 jtc if (EAT('*')) { /* implemented as +? */
999 1.4 jtc /* this case does not require the (y|) trick, noKLUDGE */
1000 1.1 jtc INSERT(OPLUS_, pos);
1001 1.1 jtc ASTERN(O_PLUS, pos);
1002 1.1 jtc INSERT(OQUEST_, pos);
1003 1.1 jtc ASTERN(O_QUEST, pos);
1004 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
1005 1.39 christos } else if (p->gnuext && EATTWO('\\', '?')) {
1006 1.39 christos INSERT(OQUEST_, pos);
1007 1.39 christos ASTERN(O_QUEST, pos);
1008 1.39 christos } else if (p->gnuext && EATTWO('\\', '+')) {
1009 1.39 christos INSERT(OPLUS_, pos);
1010 1.39 christos ASTERN(O_PLUS, pos);
1011 1.39 christos #endif
1012 1.1 jtc } else if (EATTWO('\\', '{')) {
1013 1.1 jtc count = p_count(p);
1014 1.1 jtc if (EAT(',')) {
1015 1.39 christos if (MORE() && isdigit((uch)PEEK())) {
1016 1.1 jtc count2 = p_count(p);
1017 1.39 christos (void)REQUIRE(count <= count2, REG_BADBR);
1018 1.1 jtc } else /* single number with comma */
1019 1.1 jtc count2 = INFINITY;
1020 1.1 jtc } else /* just a single number */
1021 1.1 jtc count2 = count;
1022 1.39 christos repeat(p, pos, count, count2);
1023 1.1 jtc if (!EATTWO('\\', '}')) { /* error heuristics */
1024 1.1 jtc while (MORE() && !SEETWO('\\', '}'))
1025 1.1 jtc NEXT();
1026 1.39 christos (void)REQUIRE(MORE(), REG_EBRACE);
1027 1.1 jtc SETERROR(REG_BADBR);
1028 1.1 jtc }
1029 1.39 christos } else if (c == '$') /* $ (but not \$) ends it */
1030 1.39 christos return (true);
1031 1.1 jtc
1032 1.39 christos return (false);
1033 1.1 jtc }
1034 1.1 jtc
1035 1.1 jtc /*
1036 1.1 jtc - p_count - parse a repetition count
1037 1.9 perry == static int p_count(struct parse *p);
1038 1.1 jtc */
1039 1.1 jtc static int /* the value */
1040 1.39 christos p_count(struct parse *p)
1041 1.1 jtc {
1042 1.9 perry int count = 0;
1043 1.9 perry int ndigits = 0;
1044 1.1 jtc
1045 1.39 christos while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
1046 1.1 jtc count = count*10 + (GETNEXT() - '0');
1047 1.1 jtc ndigits++;
1048 1.1 jtc }
1049 1.1 jtc
1050 1.39 christos (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
1051 1.1 jtc return(count);
1052 1.1 jtc }
1053 1.1 jtc
1054 1.1 jtc /*
1055 1.1 jtc - p_bracket - parse a bracketed character list
1056 1.9 perry == static void p_bracket(struct parse *p);
1057 1.1 jtc */
1058 1.1 jtc static void
1059 1.39 christos p_bracket(struct parse *p)
1060 1.1 jtc {
1061 1.14 lukem cset *cs;
1062 1.39 christos wint_t ch;
1063 1.14 lukem
1064 1.1 jtc /* Dept of Truly Sickening Special-Case Kludges */
1065 1.39 christos if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
1066 1.1 jtc EMIT(OBOW, 0);
1067 1.1 jtc NEXTn(6);
1068 1.1 jtc return;
1069 1.1 jtc }
1070 1.39 christos if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
1071 1.1 jtc EMIT(OEOW, 0);
1072 1.1 jtc NEXTn(6);
1073 1.1 jtc return;
1074 1.1 jtc }
1075 1.1 jtc
1076 1.39 christos if ((cs = allocset(p)) == NULL)
1077 1.39 christos return;
1078 1.39 christos
1079 1.39 christos if (p->g->cflags®_ICASE)
1080 1.39 christos cs->icase = 1;
1081 1.1 jtc if (EAT('^'))
1082 1.39 christos cs->invert = 1;
1083 1.1 jtc if (EAT(']'))
1084 1.39 christos CHadd(p, cs, ']');
1085 1.1 jtc else if (EAT('-'))
1086 1.39 christos CHadd(p, cs, '-');
1087 1.1 jtc while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
1088 1.1 jtc p_b_term(p, cs);
1089 1.1 jtc if (EAT('-'))
1090 1.39 christos CHadd(p, cs, '-');
1091 1.39 christos (void)MUSTEAT(']', REG_EBRACK);
1092 1.1 jtc
1093 1.1 jtc if (p->error != 0) /* don't mess things up further */
1094 1.1 jtc return;
1095 1.1 jtc
1096 1.39 christos if (cs->invert && p->g->cflags®_NEWLINE)
1097 1.39 christos cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
1098 1.1 jtc
1099 1.39 christos if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
1100 1.39 christos ordinary(p, ch);
1101 1.39 christos freeset(p, cs);
1102 1.39 christos } else
1103 1.40 christos EMIT(OANYOF, (size_t)(cs - p->g->sets));
1104 1.39 christos }
1105 1.1 jtc
1106 1.39 christos static int
1107 1.39 christos p_range_cmp(wchar_t c1, wchar_t c2)
1108 1.39 christos {
1109 1.39 christos #ifdef REGEX_LIBC_COLLATE
1110 1.39 christos return __wcollate_range_cmp(c1, c2);
1111 1.42 christos #elif defined(NLS)
1112 1.39 christos /* Copied from libc/collate __wcollate_range_cmp */
1113 1.39 christos wchar_t s1[2], s2[2];
1114 1.1 jtc
1115 1.39 christos s1[0] = c1;
1116 1.39 christos s1[1] = L'\0';
1117 1.39 christos s2[0] = c2;
1118 1.39 christos s2[1] = L'\0';
1119 1.42 christos return wcscoll(s1, s2);
1120 1.42 christos #else
1121 1.42 christos char s1[2], s2[2];
1122 1.42 christos
1123 1.42 christos s1[0] = (char)c1;
1124 1.42 christos s1[1] = '\0';
1125 1.42 christos s2[0] = (char)c2;
1126 1.42 christos s2[1] = '\0';
1127 1.42 christos return strcoll(s1, s2);
1128 1.39 christos #endif
1129 1.1 jtc }
1130 1.1 jtc
1131 1.1 jtc /*
1132 1.1 jtc - p_b_term - parse one term of a bracketed character list
1133 1.9 perry == static void p_b_term(struct parse *p, cset *cs);
1134 1.1 jtc */
1135 1.1 jtc static void
1136 1.39 christos p_b_term(struct parse *p, cset *cs)
1137 1.1 jtc {
1138 1.9 perry char c;
1139 1.39 christos wint_t start, finish;
1140 1.39 christos wint_t i;
1141 1.39 christos #ifdef REGEX_LIBC_COLLATE
1142 1.39 christos struct xlocale_collate *table =
1143 1.39 christos (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
1144 1.39 christos #endif
1145 1.1 jtc
1146 1.14 lukem _DIAGASSERT(p != NULL);
1147 1.14 lukem _DIAGASSERT(cs != NULL);
1148 1.14 lukem
1149 1.1 jtc /* classify what we've got */
1150 1.1 jtc switch ((MORE()) ? PEEK() : '\0') {
1151 1.1 jtc case '[':
1152 1.1 jtc c = (MORE2()) ? PEEK2() : '\0';
1153 1.1 jtc break;
1154 1.1 jtc case '-':
1155 1.1 jtc SETERROR(REG_ERANGE);
1156 1.1 jtc return; /* NOTE RETURN */
1157 1.1 jtc default:
1158 1.1 jtc c = '\0';
1159 1.1 jtc break;
1160 1.1 jtc }
1161 1.1 jtc
1162 1.1 jtc switch (c) {
1163 1.1 jtc case ':': /* character class */
1164 1.1 jtc NEXT2();
1165 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1166 1.1 jtc c = PEEK();
1167 1.39 christos (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
1168 1.1 jtc p_b_cclass(p, cs);
1169 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1170 1.39 christos (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
1171 1.1 jtc break;
1172 1.1 jtc case '=': /* equivalence class */
1173 1.1 jtc NEXT2();
1174 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1175 1.1 jtc c = PEEK();
1176 1.39 christos (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
1177 1.1 jtc p_b_eclass(p, cs);
1178 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1179 1.39 christos (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
1180 1.1 jtc break;
1181 1.1 jtc default: /* symbol, ordinary character, or range */
1182 1.1 jtc start = p_b_symbol(p);
1183 1.1 jtc if (SEE('-') && MORE2() && PEEK2() != ']') {
1184 1.1 jtc /* range */
1185 1.1 jtc NEXT();
1186 1.1 jtc if (EAT('-'))
1187 1.1 jtc finish = '-';
1188 1.1 jtc else
1189 1.1 jtc finish = p_b_symbol(p);
1190 1.1 jtc } else
1191 1.1 jtc finish = start;
1192 1.39 christos if (start == finish)
1193 1.39 christos CHadd(p, cs, start);
1194 1.39 christos else {
1195 1.39 christos #ifdef REGEX_LIBC_COLLATE
1196 1.39 christos if (table->__collate_load_error || MB_CUR_MAX > 1) {
1197 1.39 christos #else
1198 1.39 christos if (MB_CUR_MAX > 1) {
1199 1.39 christos #endif
1200 1.39 christos (void)REQUIRE(start <= finish, REG_ERANGE);
1201 1.39 christos CHaddrange(p, cs, start, finish);
1202 1.39 christos } else {
1203 1.39 christos (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE);
1204 1.39 christos for (i = 0; i <= UCHAR_MAX; i++) {
1205 1.39 christos if (p_range_cmp(start, i) <= 0 &&
1206 1.39 christos p_range_cmp(i, finish) <= 0 )
1207 1.39 christos CHadd(p, cs, i);
1208 1.39 christos }
1209 1.39 christos }
1210 1.39 christos }
1211 1.39 christos break;
1212 1.39 christos }
1213 1.39 christos }
1214 1.39 christos
1215 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
1216 1.39 christos /*
1217 1.39 christos - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S)
1218 1.39 christos == static int p_b_pseudoclass(struct parse *p, char c)
1219 1.39 christos */
1220 1.39 christos static int
1221 1.39 christos p_b_pseudoclass(struct parse *p, char c) {
1222 1.39 christos cset *cs;
1223 1.39 christos
1224 1.39 christos if ((cs = allocset(p)) == NULL)
1225 1.39 christos return(0);
1226 1.39 christos
1227 1.39 christos if (p->g->cflags®_ICASE)
1228 1.39 christos cs->icase = 1;
1229 1.39 christos
1230 1.39 christos switch (c) {
1231 1.39 christos case 'W':
1232 1.39 christos cs->invert = 1;
1233 1.39 christos /* FALLTHROUGH */
1234 1.39 christos case 'w':
1235 1.39 christos p_b_cclass_named(p, cs, "alnum");
1236 1.1 jtc break;
1237 1.39 christos case 'S':
1238 1.39 christos cs->invert = 1;
1239 1.39 christos /* FALLTHROUGH */
1240 1.39 christos case 's':
1241 1.39 christos p_b_cclass_named(p, cs, "space");
1242 1.39 christos break;
1243 1.39 christos default:
1244 1.39 christos return(0);
1245 1.1 jtc }
1246 1.39 christos
1247 1.40 christos EMIT(OANYOF, (size_t)(cs - p->g->sets));
1248 1.39 christos return(1);
1249 1.1 jtc }
1250 1.39 christos #endif
1251 1.1 jtc
1252 1.1 jtc /*
1253 1.1 jtc - p_b_cclass - parse a character-class name and deal with it
1254 1.9 perry == static void p_b_cclass(struct parse *p, cset *cs);
1255 1.1 jtc */
1256 1.1 jtc static void
1257 1.39 christos p_b_cclass(struct parse *p, cset *cs)
1258 1.1 jtc {
1259 1.39 christos const char *sp = p->next;
1260 1.9 perry size_t len;
1261 1.39 christos char clname[16];
1262 1.1 jtc
1263 1.39 christos while (MORE() && isalpha((uch)PEEK()))
1264 1.1 jtc NEXT();
1265 1.1 jtc len = p->next - sp;
1266 1.39 christos if (len >= sizeof(clname) - 1) {
1267 1.1 jtc SETERROR(REG_ECTYPE);
1268 1.1 jtc return;
1269 1.1 jtc }
1270 1.39 christos memcpy(clname, sp, len);
1271 1.39 christos clname[len] = '\0';
1272 1.39 christos
1273 1.39 christos p_b_cclass_named(p, cs, clname);
1274 1.39 christos }
1275 1.42 christos
1276 1.39 christos /*
1277 1.39 christos - p_b_cclass_named - deal with a named character class
1278 1.39 christos == static void p_b_cclass_named(struct parse *p, cset *cs, const char []);
1279 1.39 christos */
1280 1.39 christos static void
1281 1.39 christos p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) {
1282 1.39 christos wctype_t wct;
1283 1.1 jtc
1284 1.39 christos if ((wct = wctype(clname)) == 0) {
1285 1.39 christos SETERROR(REG_ECTYPE);
1286 1.39 christos return;
1287 1.39 christos }
1288 1.39 christos CHaddtype(p, cs, wct);
1289 1.1 jtc }
1290 1.1 jtc
1291 1.1 jtc /*
1292 1.1 jtc - p_b_eclass - parse an equivalence-class name and deal with it
1293 1.9 perry == static void p_b_eclass(struct parse *p, cset *cs);
1294 1.1 jtc *
1295 1.1 jtc * This implementation is incomplete. xxx
1296 1.1 jtc */
1297 1.1 jtc static void
1298 1.39 christos p_b_eclass(struct parse *p, cset *cs)
1299 1.1 jtc {
1300 1.39 christos wint_t c;
1301 1.1 jtc
1302 1.14 lukem _DIAGASSERT(p != NULL);
1303 1.14 lukem _DIAGASSERT(cs != NULL);
1304 1.14 lukem
1305 1.1 jtc c = p_b_coll_elem(p, '=');
1306 1.39 christos CHadd(p, cs, c);
1307 1.1 jtc }
1308 1.1 jtc
1309 1.1 jtc /*
1310 1.1 jtc - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1311 1.39 christos == static wint_t p_b_symbol(struct parse *p);
1312 1.1 jtc */
1313 1.39 christos static wint_t /* value of symbol */
1314 1.39 christos p_b_symbol(struct parse *p)
1315 1.1 jtc {
1316 1.39 christos wint_t value;
1317 1.1 jtc
1318 1.14 lukem _DIAGASSERT(p != NULL);
1319 1.14 lukem
1320 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1321 1.1 jtc if (!EATTWO('[', '.'))
1322 1.39 christos return(WGETNEXT());
1323 1.1 jtc
1324 1.1 jtc /* collating symbol */
1325 1.1 jtc value = p_b_coll_elem(p, '.');
1326 1.39 christos (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1327 1.1 jtc return(value);
1328 1.1 jtc }
1329 1.1 jtc
1330 1.1 jtc /*
1331 1.1 jtc - p_b_coll_elem - parse a collating-element name and look it up
1332 1.39 christos == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
1333 1.1 jtc */
1334 1.39 christos static wint_t /* value of collating element */
1335 1.39 christos p_b_coll_elem(struct parse *p,
1336 1.39 christos wint_t endc) /* name ended by endc,']' */
1337 1.39 christos {
1338 1.39 christos const char *sp = p->next;
1339 1.39 christos struct cname *cp;
1340 1.42 christos size_t len;
1341 1.1 jtc
1342 1.14 lukem _DIAGASSERT(p != NULL);
1343 1.14 lukem
1344 1.1 jtc while (MORE() && !SEETWO(endc, ']'))
1345 1.1 jtc NEXT();
1346 1.1 jtc if (!MORE()) {
1347 1.1 jtc SETERROR(REG_EBRACK);
1348 1.1 jtc return(0);
1349 1.1 jtc }
1350 1.1 jtc len = p->next - sp;
1351 1.1 jtc for (cp = cnames; cp->name != NULL; cp++)
1352 1.37 christos if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1353 1.1 jtc return(cp->code); /* known name */
1354 1.42 christos #ifdef NLS
1355 1.42 christos mbstate_t mbs;
1356 1.42 christos wchar_t wc;
1357 1.42 christos size_t clen;
1358 1.42 christos
1359 1.39 christos memset(&mbs, 0, sizeof(mbs));
1360 1.39 christos if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1361 1.39 christos return (wc); /* single character */
1362 1.39 christos else if (clen == (size_t)-1 || clen == (size_t)-2)
1363 1.39 christos SETERROR(REG_ILLSEQ);
1364 1.39 christos else
1365 1.39 christos SETERROR(REG_ECOLLATE); /* neither */
1366 1.1 jtc return(0);
1367 1.42 christos #else
1368 1.42 christos if (len == 1)
1369 1.42 christos return *sp; /* single character */
1370 1.42 christos SETERROR(REG_ECOLLATE); /* neither */
1371 1.42 christos return 0;
1372 1.42 christos #endif
1373 1.1 jtc }
1374 1.1 jtc
1375 1.1 jtc /*
1376 1.39 christos - may_escape - determine whether 'ch' is escape-able in the current context
1377 1.39 christos == static int may_escape(struct parse *p, const wint_t ch)
1378 1.39 christos */
1379 1.39 christos static bool
1380 1.39 christos may_escape(struct parse *p, const wint_t ch)
1381 1.39 christos {
1382 1.39 christos
1383 1.39 christos if ((p->pflags & PFLAG_LEGACY_ESC) != 0)
1384 1.39 christos return (true);
1385 1.39 christos if (isalpha(ch) || ch == '\'' || ch == '`')
1386 1.39 christos return (false);
1387 1.39 christos return (true);
1388 1.39 christos #ifdef NOTYET
1389 1.39 christos /*
1390 1.39 christos * Build a whitelist of characters that may be escaped to produce an
1391 1.39 christos * ordinary in the current context. This assumes that these have not
1392 1.39 christos * been otherwise interpreted as a special character. Escaping an
1393 1.39 christos * ordinary character yields undefined results according to
1394 1.39 christos * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
1395 1.39 christos * advantage of this and use escaped ordinary characters to provide
1396 1.39 christos * special meaning, e.g. \b, \B, \w, \W, \s, \S.
1397 1.39 christos */
1398 1.39 christos switch(ch) {
1399 1.39 christos case '|':
1400 1.39 christos case '+':
1401 1.39 christos case '?':
1402 1.39 christos /* The above characters may not be escaped in BREs */
1403 1.39 christos if (!(p->g->cflags®_EXTENDED))
1404 1.39 christos return (false);
1405 1.39 christos /* Fallthrough */
1406 1.39 christos case '(':
1407 1.39 christos case ')':
1408 1.39 christos case '{':
1409 1.39 christos case '}':
1410 1.39 christos case '.':
1411 1.39 christos case '[':
1412 1.39 christos case ']':
1413 1.39 christos case '\\':
1414 1.39 christos case '*':
1415 1.39 christos case '^':
1416 1.39 christos case '$':
1417 1.39 christos return (true);
1418 1.39 christos default:
1419 1.39 christos return (false);
1420 1.39 christos }
1421 1.39 christos #endif
1422 1.39 christos }
1423 1.39 christos
1424 1.39 christos /*
1425 1.1 jtc - othercase - return the case counterpart of an alphabetic
1426 1.39 christos == static wint_t othercase(wint_t ch);
1427 1.1 jtc */
1428 1.39 christos static wint_t /* if no counterpart, return ch */
1429 1.39 christos othercase(wint_t ch)
1430 1.39 christos {
1431 1.39 christos assert(iswalpha(ch));
1432 1.39 christos if (iswupper(ch))
1433 1.39 christos return(towlower(ch));
1434 1.39 christos else if (iswlower(ch))
1435 1.39 christos return(towupper(ch));
1436 1.1 jtc else /* peculiar, but could happen */
1437 1.1 jtc return(ch);
1438 1.1 jtc }
1439 1.1 jtc
1440 1.1 jtc /*
1441 1.1 jtc - bothcases - emit a dualcase version of a two-case character
1442 1.39 christos == static void bothcases(struct parse *p, wint_t ch);
1443 1.1 jtc *
1444 1.1 jtc * Boy, is this implementation ever a kludge...
1445 1.1 jtc */
1446 1.1 jtc static void
1447 1.39 christos bothcases(struct parse *p, wint_t ch)
1448 1.1 jtc {
1449 1.39 christos const char *oldnext = p->next;
1450 1.39 christos const char *oldend = p->end;
1451 1.39 christos char bracket[3 + MB_LEN_MAX];
1452 1.39 christos size_t n;
1453 1.1 jtc
1454 1.14 lukem _DIAGASSERT(p != NULL);
1455 1.14 lukem
1456 1.1 jtc assert(othercase(ch) != ch); /* p_bracket() would recurse */
1457 1.1 jtc p->next = bracket;
1458 1.42 christos #ifdef NLS
1459 1.42 christos mbstate_t mbs;
1460 1.39 christos memset(&mbs, 0, sizeof(mbs));
1461 1.39 christos n = wcrtomb(bracket, ch, &mbs);
1462 1.39 christos assert(n != (size_t)-1);
1463 1.42 christos #else
1464 1.42 christos n = 0;
1465 1.42 christos bracket[n++] = ch;
1466 1.42 christos #endif
1467 1.39 christos bracket[n] = ']';
1468 1.39 christos bracket[n + 1] = '\0';
1469 1.39 christos p->end = bracket+n+1;
1470 1.1 jtc p_bracket(p);
1471 1.39 christos assert(p->next == p->end);
1472 1.1 jtc p->next = oldnext;
1473 1.1 jtc p->end = oldend;
1474 1.1 jtc }
1475 1.1 jtc
1476 1.1 jtc /*
1477 1.1 jtc - ordinary - emit an ordinary character
1478 1.39 christos == static void ordinary(struct parse *p, wint_t ch);
1479 1.1 jtc */
1480 1.1 jtc static void
1481 1.39 christos ordinary(struct parse *p, wint_t ch)
1482 1.1 jtc {
1483 1.39 christos cset *cs;
1484 1.1 jtc
1485 1.14 lukem _DIAGASSERT(p != NULL);
1486 1.14 lukem
1487 1.39 christos if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1488 1.39 christos bothcases(p, ch);
1489 1.40 christos else if ((wint_t)(ch & OPDMASK) == ch)
1490 1.40 christos EMIT(OCHAR, (size_t)ch);
1491 1.1 jtc else {
1492 1.39 christos /*
1493 1.39 christos * Kludge: character is too big to fit into an OCHAR operand.
1494 1.39 christos * Emit a singleton set.
1495 1.39 christos */
1496 1.39 christos if ((cs = allocset(p)) == NULL)
1497 1.39 christos return;
1498 1.39 christos CHadd(p, cs, ch);
1499 1.40 christos EMIT(OANYOF, (size_t)(cs - p->g->sets));
1500 1.1 jtc }
1501 1.1 jtc }
1502 1.1 jtc
1503 1.1 jtc /*
1504 1.1 jtc - nonnewline - emit REG_NEWLINE version of OANY
1505 1.9 perry == static void nonnewline(struct parse *p);
1506 1.1 jtc *
1507 1.1 jtc * Boy, is this implementation ever a kludge...
1508 1.1 jtc */
1509 1.1 jtc static void
1510 1.39 christos nonnewline(struct parse *p)
1511 1.1 jtc {
1512 1.39 christos const char *oldnext = p->next;
1513 1.39 christos const char *oldend = p->end;
1514 1.1 jtc char bracket[4];
1515 1.1 jtc
1516 1.14 lukem _DIAGASSERT(p != NULL);
1517 1.14 lukem
1518 1.1 jtc p->next = bracket;
1519 1.1 jtc p->end = bracket+3;
1520 1.1 jtc bracket[0] = '^';
1521 1.1 jtc bracket[1] = '\n';
1522 1.1 jtc bracket[2] = ']';
1523 1.1 jtc bracket[3] = '\0';
1524 1.1 jtc p_bracket(p);
1525 1.1 jtc assert(p->next == bracket+3);
1526 1.1 jtc p->next = oldnext;
1527 1.1 jtc p->end = oldend;
1528 1.1 jtc }
1529 1.1 jtc
1530 1.1 jtc /*
1531 1.1 jtc - repeat - generate code for a bounded repetition, recursively if needed
1532 1.39 christos == static void repeat(struct parse *p, sopno start, int from, int to);
1533 1.1 jtc */
1534 1.1 jtc static void
1535 1.39 christos repeat(struct parse *p,
1536 1.39 christos sopno start, /* operand from here to end of strip */
1537 1.39 christos int from, /* repeated from this number */
1538 1.39 christos int to) /* to this number of times (maybe INFINITY) */
1539 1.1 jtc {
1540 1.39 christos sopno finish = HERE();
1541 1.1 jtc # define N 2
1542 1.1 jtc # define INF 3
1543 1.1 jtc # define REP(f, t) ((f)*8 + (t))
1544 1.1 jtc # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1545 1.9 perry sopno copy;
1546 1.1 jtc
1547 1.14 lukem _DIAGASSERT(p != NULL);
1548 1.14 lukem
1549 1.39 christos if (p->error != 0) /* head off possible runaway recursion */
1550 1.30 christos return;
1551 1.30 christos
1552 1.1 jtc assert(from <= to);
1553 1.1 jtc
1554 1.1 jtc switch (REP(MAP(from), MAP(to))) {
1555 1.1 jtc case REP(0, 0): /* must be user doing this */
1556 1.1 jtc DROP(finish-start); /* drop the operand */
1557 1.1 jtc break;
1558 1.1 jtc case REP(0, 1): /* as x{1,1}? */
1559 1.1 jtc case REP(0, N): /* as x{1,n}? */
1560 1.1 jtc case REP(0, INF): /* as x{1,}? */
1561 1.4 jtc /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1562 1.4 jtc INSERT(OCH_, start); /* offset is wrong... */
1563 1.39 christos repeat(p, start+1, 1, to);
1564 1.4 jtc ASTERN(OOR1, start);
1565 1.1 jtc AHEAD(start); /* ... fix it */
1566 1.4 jtc EMIT(OOR2, 0);
1567 1.4 jtc AHEAD(THERE());
1568 1.4 jtc ASTERN(O_CH, THERETHERE());
1569 1.1 jtc break;
1570 1.1 jtc case REP(1, 1): /* trivial case */
1571 1.1 jtc /* done */
1572 1.1 jtc break;
1573 1.1 jtc case REP(1, N): /* as x?x{1,n-1} */
1574 1.4 jtc /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1575 1.4 jtc INSERT(OCH_, start);
1576 1.4 jtc ASTERN(OOR1, start);
1577 1.4 jtc AHEAD(start);
1578 1.4 jtc EMIT(OOR2, 0); /* offset very wrong... */
1579 1.4 jtc AHEAD(THERE()); /* ...so fix it */
1580 1.4 jtc ASTERN(O_CH, THERETHERE());
1581 1.1 jtc copy = dupl(p, start+1, finish+1);
1582 1.4 jtc assert(copy == finish+4);
1583 1.39 christos repeat(p, copy, 1, to-1);
1584 1.1 jtc break;
1585 1.1 jtc case REP(1, INF): /* as x+ */
1586 1.1 jtc INSERT(OPLUS_, start);
1587 1.1 jtc ASTERN(O_PLUS, start);
1588 1.1 jtc break;
1589 1.1 jtc case REP(N, N): /* as xx{m-1,n-1} */
1590 1.1 jtc copy = dupl(p, start, finish);
1591 1.39 christos repeat(p, copy, from-1, to-1);
1592 1.1 jtc break;
1593 1.1 jtc case REP(N, INF): /* as xx{n-1,INF} */
1594 1.1 jtc copy = dupl(p, start, finish);
1595 1.39 christos repeat(p, copy, from-1, to);
1596 1.1 jtc break;
1597 1.1 jtc default: /* "can't happen" */
1598 1.1 jtc SETERROR(REG_ASSERT); /* just in case */
1599 1.1 jtc break;
1600 1.1 jtc }
1601 1.1 jtc }
1602 1.1 jtc
1603 1.1 jtc /*
1604 1.39 christos - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1605 1.39 christos - character from the parse struct, signals a REG_ILLSEQ error if the
1606 1.39 christos - character can't be converted. Returns the number of bytes consumed.
1607 1.39 christos */
1608 1.39 christos static wint_t
1609 1.39 christos wgetnext(struct parse *p)
1610 1.39 christos {
1611 1.42 christos #ifdef NLS
1612 1.39 christos mbstate_t mbs;
1613 1.39 christos wchar_t wc;
1614 1.39 christos size_t n;
1615 1.39 christos
1616 1.39 christos memset(&mbs, 0, sizeof(mbs));
1617 1.40 christos n = mbrtowc(&wc, p->next, (size_t)(p->end - p->next), &mbs);
1618 1.39 christos if (n == (size_t)-1 || n == (size_t)-2) {
1619 1.39 christos SETERROR(REG_ILLSEQ);
1620 1.39 christos return (0);
1621 1.39 christos }
1622 1.39 christos if (n == 0)
1623 1.39 christos n = 1;
1624 1.39 christos p->next += n;
1625 1.42 christos return wc;
1626 1.42 christos #else
1627 1.42 christos return *p->next++;
1628 1.42 christos #endif
1629 1.39 christos }
1630 1.39 christos
1631 1.39 christos /*
1632 1.1 jtc - seterr - set an error condition
1633 1.9 perry == static int seterr(struct parse *p, int e);
1634 1.1 jtc */
1635 1.1 jtc static int /* useless but makes type checking happy */
1636 1.39 christos seterr(struct parse *p, int e)
1637 1.1 jtc {
1638 1.14 lukem
1639 1.14 lukem _DIAGASSERT(p != NULL);
1640 1.14 lukem
1641 1.1 jtc if (p->error == 0) /* keep earliest error condition */
1642 1.1 jtc p->error = e;
1643 1.1 jtc p->next = nuls; /* try to bring things to a halt */
1644 1.1 jtc p->end = nuls;
1645 1.1 jtc return(0); /* make the return value well-defined */
1646 1.1 jtc }
1647 1.1 jtc
1648 1.1 jtc /*
1649 1.1 jtc - allocset - allocate a set of characters for []
1650 1.9 perry == static cset *allocset(struct parse *p);
1651 1.1 jtc */
1652 1.1 jtc static cset *
1653 1.39 christos allocset(struct parse *p)
1654 1.1 jtc {
1655 1.39 christos cset *cs, *ncs;
1656 1.1 jtc
1657 1.14 lukem _DIAGASSERT(p != NULL);
1658 1.14 lukem
1659 1.39 christos ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs));
1660 1.39 christos if (ncs == NULL) {
1661 1.39 christos SETERROR(REG_ESPACE);
1662 1.39 christos return (NULL);
1663 1.1 jtc }
1664 1.39 christos p->g->sets = ncs;
1665 1.39 christos cs = &p->g->sets[p->g->ncsets++];
1666 1.39 christos memset(cs, 0, sizeof(*cs));
1667 1.1 jtc
1668 1.1 jtc return(cs);
1669 1.1 jtc }
1670 1.1 jtc
1671 1.1 jtc /*
1672 1.1 jtc - freeset - free a now-unused set
1673 1.9 perry == static void freeset(struct parse *p, cset *cs);
1674 1.1 jtc */
1675 1.1 jtc static void
1676 1.39 christos freeset(struct parse *p, cset *cs)
1677 1.1 jtc {
1678 1.14 lukem cset *top;
1679 1.14 lukem
1680 1.14 lukem _DIAGASSERT(p != NULL);
1681 1.14 lukem _DIAGASSERT(cs != NULL);
1682 1.14 lukem
1683 1.14 lukem top = &p->g->sets[p->g->ncsets];
1684 1.1 jtc
1685 1.39 christos free(cs->wides);
1686 1.39 christos free(cs->ranges);
1687 1.39 christos free(cs->types);
1688 1.39 christos memset(cs, 0, sizeof(*cs));
1689 1.1 jtc if (cs == top-1) /* recover only the easy case */
1690 1.1 jtc p->g->ncsets--;
1691 1.1 jtc }
1692 1.1 jtc
1693 1.1 jtc /*
1694 1.39 christos - singleton - Determine whether a set contains only one character,
1695 1.39 christos - returning it if so, otherwise returning OUT.
1696 1.1 jtc */
1697 1.39 christos static wint_t
1698 1.39 christos singleton(cset *cs)
1699 1.1 jtc {
1700 1.39 christos wint_t i, s, n;
1701 1.1 jtc
1702 1.39 christos for (i = n = 0; i < NC; i++)
1703 1.39 christos if (CHIN(cs, i)) {
1704 1.1 jtc n++;
1705 1.39 christos s = i;
1706 1.39 christos }
1707 1.39 christos if (n == 1)
1708 1.39 christos return (s);
1709 1.39 christos if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1710 1.39 christos cs->icase == 0)
1711 1.39 christos return (cs->wides[0]);
1712 1.39 christos /* Don't bother handling the other cases. */
1713 1.39 christos return (OUT);
1714 1.1 jtc }
1715 1.1 jtc
1716 1.1 jtc /*
1717 1.39 christos - CHadd - add character to character set.
1718 1.1 jtc */
1719 1.1 jtc static void
1720 1.39 christos CHadd(struct parse *p, cset *cs, wint_t ch)
1721 1.1 jtc {
1722 1.39 christos wint_t nch, *newwides;
1723 1.14 lukem
1724 1.14 lukem _DIAGASSERT(p != NULL);
1725 1.14 lukem _DIAGASSERT(cs != NULL);
1726 1.14 lukem
1727 1.39 christos assert(ch >= 0);
1728 1.39 christos if (ch < NC)
1729 1.40 christos cs->bmp[(unsigned)ch >> 3] |= 1 << (ch & 7);
1730 1.39 christos else {
1731 1.39 christos newwides = reallocarray(cs->wides, cs->nwides + 1,
1732 1.39 christos sizeof(*cs->wides));
1733 1.39 christos if (newwides == NULL) {
1734 1.39 christos SETERROR(REG_ESPACE);
1735 1.39 christos return;
1736 1.39 christos }
1737 1.39 christos cs->wides = newwides;
1738 1.39 christos cs->wides[cs->nwides++] = ch;
1739 1.39 christos }
1740 1.39 christos if (cs->icase) {
1741 1.39 christos if ((nch = towlower(ch)) < NC)
1742 1.40 christos cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1743 1.39 christos if ((nch = towupper(ch)) < NC)
1744 1.40 christos cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1745 1.1 jtc }
1746 1.1 jtc }
1747 1.1 jtc
1748 1.1 jtc /*
1749 1.39 christos - CHaddrange - add all characters in the range [min,max] to a character set.
1750 1.1 jtc */
1751 1.1 jtc static void
1752 1.39 christos CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1753 1.1 jtc {
1754 1.39 christos crange *newranges;
1755 1.14 lukem
1756 1.39 christos _DIAGASSERT(p != NULL);
1757 1.14 lukem _DIAGASSERT(cs != NULL);
1758 1.14 lukem
1759 1.39 christos for (; min < NC && min <= max; min++)
1760 1.39 christos CHadd(p, cs, min);
1761 1.39 christos if (min >= max)
1762 1.39 christos return;
1763 1.39 christos newranges = reallocarray(cs->ranges, cs->nranges + 1,
1764 1.39 christos sizeof(*cs->ranges));
1765 1.39 christos if (newranges == NULL) {
1766 1.39 christos SETERROR(REG_ESPACE);
1767 1.1 jtc return;
1768 1.1 jtc }
1769 1.39 christos cs->ranges = newranges;
1770 1.39 christos cs->ranges[cs->nranges].min = min;
1771 1.39 christos cs->ranges[cs->nranges].max = max;
1772 1.39 christos cs->nranges++;
1773 1.1 jtc }
1774 1.1 jtc
1775 1.1 jtc /*
1776 1.39 christos - CHaddtype - add all characters of a certain type to a character set.
1777 1.1 jtc */
1778 1.1 jtc static void
1779 1.39 christos CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1780 1.1 jtc {
1781 1.39 christos wint_t i;
1782 1.39 christos wctype_t *newtypes;
1783 1.14 lukem
1784 1.14 lukem _DIAGASSERT(p != NULL);
1785 1.14 lukem _DIAGASSERT(cs != NULL);
1786 1.14 lukem
1787 1.39 christos for (i = 0; i < NC; i++)
1788 1.39 christos if (iswctype(i, wct))
1789 1.39 christos CHadd(p, cs, i);
1790 1.39 christos newtypes = reallocarray(cs->types, cs->ntypes + 1,
1791 1.39 christos sizeof(*cs->types));
1792 1.39 christos if (newtypes == NULL) {
1793 1.39 christos SETERROR(REG_ESPACE);
1794 1.1 jtc return;
1795 1.39 christos }
1796 1.39 christos cs->types = newtypes;
1797 1.39 christos cs->types[cs->ntypes++] = wct;
1798 1.1 jtc }
1799 1.1 jtc
1800 1.1 jtc /*
1801 1.1 jtc - dupl - emit a duplicate of a bunch of sops
1802 1.9 perry == static sopno dupl(struct parse *p, sopno start, sopno finish);
1803 1.1 jtc */
1804 1.1 jtc static sopno /* start of duplicate */
1805 1.39 christos dupl(struct parse *p,
1806 1.39 christos sopno start, /* from here */
1807 1.39 christos sopno finish) /* to this less one */
1808 1.1 jtc {
1809 1.39 christos sopno ret = HERE();
1810 1.9 perry sopno len = finish - start;
1811 1.1 jtc
1812 1.14 lukem _DIAGASSERT(p != NULL);
1813 1.14 lukem
1814 1.1 jtc assert(finish >= start);
1815 1.1 jtc if (len == 0)
1816 1.1 jtc return(ret);
1817 1.39 christos if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1818 1.39 christos return(ret);
1819 1.40 christos (void) memcpy(p->strip + p->slen,
1820 1.40 christos p->strip + start, len * sizeof(*p->strip));
1821 1.1 jtc p->slen += len;
1822 1.1 jtc return(ret);
1823 1.1 jtc }
1824 1.1 jtc
1825 1.1 jtc /*
1826 1.1 jtc - doemit - emit a strip operator
1827 1.9 perry == static void doemit(struct parse *p, sop op, size_t opnd);
1828 1.1 jtc *
1829 1.1 jtc * It might seem better to implement this as a macro with a function as
1830 1.1 jtc * hard-case backup, but it's just too big and messy unless there are
1831 1.1 jtc * some changes to the data structures. Maybe later.
1832 1.1 jtc */
1833 1.1 jtc static void
1834 1.39 christos doemit(struct parse *p, sop op, size_t opnd)
1835 1.1 jtc {
1836 1.1 jtc /* avoid making error situations worse */
1837 1.1 jtc if (p->error != 0)
1838 1.1 jtc return;
1839 1.1 jtc
1840 1.39 christos _DIAGASSERT(p != NULL);
1841 1.39 christos
1842 1.1 jtc /* deal with oversize operands ("can't happen", more or less) */
1843 1.1 jtc assert(opnd < 1<<OPSHIFT);
1844 1.1 jtc
1845 1.1 jtc /* deal with undersized strip */
1846 1.1 jtc if (p->slen >= p->ssize)
1847 1.30 christos if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */
1848 1.30 christos return;
1849 1.1 jtc
1850 1.1 jtc /* finally, it's all reduced to the easy case */
1851 1.40 christos p->strip[p->slen++] = (sopno)SOP(op, opnd);
1852 1.1 jtc }
1853 1.1 jtc
1854 1.1 jtc /*
1855 1.1 jtc - doinsert - insert a sop into the strip
1856 1.9 perry == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1857 1.1 jtc */
1858 1.1 jtc static void
1859 1.39 christos doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1860 1.1 jtc {
1861 1.9 perry sopno sn;
1862 1.9 perry sop s;
1863 1.9 perry int i;
1864 1.1 jtc
1865 1.14 lukem _DIAGASSERT(p != NULL);
1866 1.14 lukem
1867 1.1 jtc /* avoid making error situations worse */
1868 1.1 jtc if (p->error != 0)
1869 1.1 jtc return;
1870 1.1 jtc
1871 1.1 jtc sn = HERE();
1872 1.1 jtc EMIT(op, opnd); /* do checks, ensure space */
1873 1.1 jtc assert(HERE() == sn+1);
1874 1.1 jtc s = p->strip[sn];
1875 1.1 jtc
1876 1.1 jtc /* adjust paren pointers */
1877 1.1 jtc assert(pos > 0);
1878 1.1 jtc for (i = 1; i < NPAREN; i++) {
1879 1.1 jtc if (p->pbegin[i] >= pos) {
1880 1.1 jtc p->pbegin[i]++;
1881 1.1 jtc }
1882 1.1 jtc if (p->pend[i] >= pos) {
1883 1.1 jtc p->pend[i]++;
1884 1.1 jtc }
1885 1.1 jtc }
1886 1.1 jtc
1887 1.40 christos memmove(&p->strip[pos+1], &p->strip[pos],
1888 1.40 christos (HERE()-pos-1)*sizeof(*p->strip));
1889 1.1 jtc p->strip[pos] = s;
1890 1.1 jtc }
1891 1.1 jtc
1892 1.1 jtc /*
1893 1.1 jtc - dofwd - complete a forward reference
1894 1.9 perry == static void dofwd(struct parse *p, sopno pos, sop value);
1895 1.1 jtc */
1896 1.1 jtc static void
1897 1.39 christos dofwd(struct parse *p, sopno pos, sop value)
1898 1.1 jtc {
1899 1.14 lukem
1900 1.14 lukem _DIAGASSERT(p != NULL);
1901 1.14 lukem
1902 1.1 jtc /* avoid making error situations worse */
1903 1.1 jtc if (p->error != 0)
1904 1.1 jtc return;
1905 1.1 jtc
1906 1.1 jtc assert(value < 1<<OPSHIFT);
1907 1.39 christos p->strip[pos] = OP(p->strip[pos]) | value;
1908 1.1 jtc }
1909 1.1 jtc
1910 1.1 jtc /*
1911 1.1 jtc - enlarge - enlarge the strip
1912 1.39 christos == static int enlarge(struct parse *p, sopno size);
1913 1.1 jtc */
1914 1.30 christos static int
1915 1.35 joerg enlarge(struct parse *p, sopno size)
1916 1.1 jtc {
1917 1.39 christos sop *sp;
1918 1.39 christos
1919 1.14 lukem _DIAGASSERT(p != NULL);
1920 1.14 lukem
1921 1.1 jtc if (p->ssize >= size)
1922 1.30 christos return 1;
1923 1.1 jtc
1924 1.40 christos sp = reallocarray(p->strip, size, sizeof(*p->strip));
1925 1.39 christos if (sp == NULL) {
1926 1.1 jtc SETERROR(REG_ESPACE);
1927 1.30 christos return 0;
1928 1.1 jtc }
1929 1.39 christos p->strip = sp;
1930 1.35 joerg p->ssize = size;
1931 1.30 christos return 1;
1932 1.1 jtc }
1933 1.1 jtc
1934 1.1 jtc /*
1935 1.1 jtc - stripsnug - compact the strip
1936 1.9 perry == static void stripsnug(struct parse *p, struct re_guts *g);
1937 1.1 jtc */
1938 1.1 jtc static void
1939 1.39 christos stripsnug(struct parse *p, struct re_guts *g)
1940 1.1 jtc {
1941 1.14 lukem
1942 1.14 lukem _DIAGASSERT(p != NULL);
1943 1.14 lukem _DIAGASSERT(g != NULL);
1944 1.14 lukem
1945 1.1 jtc g->nstates = p->slen;
1946 1.40 christos g->strip = reallocarray(p->strip, p->slen, sizeof(*p->strip));
1947 1.39 christos if (g->strip == NULL) {
1948 1.39 christos SETERROR(REG_ESPACE);
1949 1.39 christos g->strip = p->strip;
1950 1.39 christos }
1951 1.1 jtc }
1952 1.1 jtc
1953 1.1 jtc /*
1954 1.1 jtc - findmust - fill in must and mlen with longest mandatory literal string
1955 1.9 perry == static void findmust(struct parse *p, struct re_guts *g);
1956 1.1 jtc *
1957 1.1 jtc * This algorithm could do fancy things like analyzing the operands of |
1958 1.1 jtc * for common subsequences. Someday. This code is simple and finds most
1959 1.1 jtc * of the interesting cases.
1960 1.1 jtc *
1961 1.1 jtc * Note that must and mlen got initialized during setup.
1962 1.1 jtc */
1963 1.1 jtc static void
1964 1.39 christos findmust(struct parse *p, struct re_guts *g)
1965 1.1 jtc {
1966 1.9 perry sop *scan;
1967 1.7 christos sop *start = NULL;
1968 1.9 perry sop *newstart = NULL;
1969 1.9 perry sopno newlen;
1970 1.9 perry sop s;
1971 1.9 perry char *cp;
1972 1.39 christos int offset;
1973 1.39 christos mbstate_t mbs;
1974 1.1 jtc
1975 1.14 lukem _DIAGASSERT(p != NULL);
1976 1.14 lukem _DIAGASSERT(g != NULL);
1977 1.14 lukem
1978 1.1 jtc /* avoid making error situations worse */
1979 1.1 jtc if (p->error != 0)
1980 1.1 jtc return;
1981 1.1 jtc
1982 1.39 christos #ifdef notyet
1983 1.39 christos /*
1984 1.39 christos * It's not generally safe to do a ``char'' substring search on
1985 1.39 christos * multibyte character strings, but it's safe for at least
1986 1.39 christos * UTF-8 (see RFC 3629).
1987 1.39 christos */
1988 1.39 christos if (MB_CUR_MAX > 1 &&
1989 1.39 christos strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1990 1.39 christos return;
1991 1.39 christos #endif
1992 1.39 christos
1993 1.1 jtc /* find the longest OCHAR sequence in strip */
1994 1.1 jtc newlen = 0;
1995 1.39 christos offset = 0;
1996 1.39 christos g->moffset = 0;
1997 1.1 jtc scan = g->strip + 1;
1998 1.1 jtc do {
1999 1.1 jtc s = *scan++;
2000 1.1 jtc switch (OP(s)) {
2001 1.1 jtc case OCHAR: /* sequence member */
2002 1.39 christos if (newlen == 0) { /* new sequence */
2003 1.39 christos memset(&mbs, 0, sizeof(mbs));
2004 1.1 jtc newstart = scan - 1;
2005 1.39 christos }
2006 1.42 christos #ifdef NLS
2007 1.42 christos char buf[MB_LEN_MAX];
2008 1.42 christos size_t clen = wcrtomb(buf, (int)OPND(s), &mbs);
2009 1.39 christos if (clen == (size_t)-1)
2010 1.39 christos goto toohard;
2011 1.40 christos newlen += (sopno)clen;
2012 1.42 christos #else
2013 1.42 christos newlen++;
2014 1.42 christos #endif
2015 1.1 jtc break;
2016 1.1 jtc case OPLUS_: /* things that don't break one */
2017 1.1 jtc case OLPAREN:
2018 1.1 jtc case ORPAREN:
2019 1.1 jtc break;
2020 1.1 jtc case OQUEST_: /* things that must be skipped */
2021 1.1 jtc case OCH_:
2022 1.39 christos offset = altoffset(scan, offset);
2023 1.1 jtc scan--;
2024 1.1 jtc do {
2025 1.1 jtc scan += OPND(s);
2026 1.1 jtc s = *scan;
2027 1.1 jtc /* assert() interferes w debug printouts */
2028 1.40 christos if (OP(s) != O_QUEST &&
2029 1.40 christos OP(s) != O_CH && OP(s) != OOR2) {
2030 1.1 jtc g->iflags |= BAD;
2031 1.1 jtc return;
2032 1.1 jtc }
2033 1.40 christos } while (OP(s) != O_QUEST && OP(s) != O_CH);
2034 1.11 christos /* FALLTHROUGH */
2035 1.39 christos case OBOW: /* things that break a sequence */
2036 1.39 christos case OEOW:
2037 1.39 christos case OBOL:
2038 1.39 christos case OEOL:
2039 1.39 christos case OBOS:
2040 1.39 christos case OEOS:
2041 1.39 christos case OWBND:
2042 1.39 christos case ONWBND:
2043 1.39 christos case O_QUEST:
2044 1.39 christos case O_CH:
2045 1.39 christos case OEND:
2046 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */
2047 1.1 jtc start = newstart;
2048 1.1 jtc g->mlen = newlen;
2049 1.39 christos if (offset > -1) {
2050 1.39 christos g->moffset += offset;
2051 1.39 christos offset = newlen;
2052 1.39 christos } else
2053 1.39 christos g->moffset = offset;
2054 1.39 christos } else {
2055 1.39 christos if (offset > -1)
2056 1.39 christos offset += newlen;
2057 1.1 jtc }
2058 1.1 jtc newlen = 0;
2059 1.1 jtc break;
2060 1.39 christos case OANY:
2061 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */
2062 1.39 christos start = newstart;
2063 1.39 christos g->mlen = newlen;
2064 1.39 christos if (offset > -1) {
2065 1.39 christos g->moffset += offset;
2066 1.39 christos offset = newlen;
2067 1.39 christos } else
2068 1.39 christos g->moffset = offset;
2069 1.39 christos } else {
2070 1.39 christos if (offset > -1)
2071 1.39 christos offset += newlen;
2072 1.39 christos }
2073 1.39 christos if (offset > -1)
2074 1.39 christos offset++;
2075 1.39 christos newlen = 0;
2076 1.39 christos break;
2077 1.39 christos case OANYOF: /* may or may not invalidate offset */
2078 1.39 christos /* First, everything as OANY */
2079 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */
2080 1.39 christos start = newstart;
2081 1.39 christos g->mlen = newlen;
2082 1.39 christos if (offset > -1) {
2083 1.39 christos g->moffset += offset;
2084 1.39 christos offset = newlen;
2085 1.39 christos } else
2086 1.39 christos g->moffset = offset;
2087 1.39 christos } else {
2088 1.39 christos if (offset > -1)
2089 1.39 christos offset += newlen;
2090 1.39 christos }
2091 1.39 christos if (offset > -1)
2092 1.39 christos offset++;
2093 1.39 christos newlen = 0;
2094 1.39 christos break;
2095 1.42 christos #ifdef NLS
2096 1.40 christos toohard:/*FALLTHROUGH*/
2097 1.42 christos #endif
2098 1.39 christos default:
2099 1.39 christos /* Anything here makes it impossible or too hard
2100 1.39 christos * to calculate the offset -- so we give up;
2101 1.39 christos * save the last known good offset, in case the
2102 1.39 christos * must sequence doesn't occur later.
2103 1.39 christos */
2104 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */
2105 1.39 christos start = newstart;
2106 1.39 christos g->mlen = newlen;
2107 1.39 christos if (offset > -1)
2108 1.39 christos g->moffset += offset;
2109 1.39 christos else
2110 1.39 christos g->moffset = offset;
2111 1.39 christos }
2112 1.39 christos offset = -1;
2113 1.39 christos newlen = 0;
2114 1.39 christos break;
2115 1.1 jtc }
2116 1.1 jtc } while (OP(s) != OEND);
2117 1.1 jtc
2118 1.39 christos if (g->mlen == 0) { /* there isn't one */
2119 1.39 christos g->moffset = -1;
2120 1.1 jtc return;
2121 1.39 christos }
2122 1.1 jtc
2123 1.1 jtc /* turn it into a character string */
2124 1.1 jtc g->must = malloc((size_t)g->mlen + 1);
2125 1.1 jtc if (g->must == NULL) { /* argh; just forget it */
2126 1.1 jtc g->mlen = 0;
2127 1.39 christos g->moffset = -1;
2128 1.1 jtc return;
2129 1.1 jtc }
2130 1.1 jtc cp = g->must;
2131 1.1 jtc scan = start;
2132 1.39 christos memset(&mbs, 0, sizeof(mbs));
2133 1.39 christos while (cp < g->must + g->mlen) {
2134 1.1 jtc while (OP(s = *scan++) != OCHAR)
2135 1.1 jtc continue;
2136 1.42 christos #ifdef NLS
2137 1.43 christos size_t clen = wcrtomb(cp, (int)OPND(s), &mbs);
2138 1.39 christos assert(clen != (size_t)-1);
2139 1.39 christos cp += clen;
2140 1.42 christos #else
2141 1.42 christos *cp++ = OPND(s);
2142 1.42 christos #endif
2143 1.1 jtc }
2144 1.2 jtc assert(cp == g->must + g->mlen);
2145 1.1 jtc *cp++ = '\0'; /* just on general principles */
2146 1.1 jtc }
2147 1.1 jtc
2148 1.1 jtc /*
2149 1.39 christos - altoffset - choose biggest offset among multiple choices
2150 1.39 christos == static int altoffset(sop *scan, int offset);
2151 1.39 christos *
2152 1.39 christos * Compute, recursively if necessary, the largest offset among multiple
2153 1.39 christos * re paths.
2154 1.39 christos */
2155 1.39 christos static int
2156 1.39 christos altoffset(sop *scan, int offset)
2157 1.39 christos {
2158 1.39 christos int largest;
2159 1.39 christos int try;
2160 1.39 christos sop s;
2161 1.39 christos
2162 1.39 christos _DIAGASSERT(scan != NULL);
2163 1.39 christos
2164 1.39 christos /* If we gave up already on offsets, return */
2165 1.39 christos if (offset == -1)
2166 1.39 christos return -1;
2167 1.39 christos
2168 1.39 christos largest = 0;
2169 1.39 christos try = 0;
2170 1.39 christos s = *scan++;
2171 1.40 christos while (OP(s) != O_QUEST && OP(s) != O_CH) {
2172 1.39 christos switch (OP(s)) {
2173 1.39 christos case OOR1:
2174 1.39 christos if (try > largest)
2175 1.39 christos largest = try;
2176 1.39 christos try = 0;
2177 1.39 christos break;
2178 1.39 christos case OQUEST_:
2179 1.39 christos case OCH_:
2180 1.39 christos try = altoffset(scan, try);
2181 1.39 christos if (try == -1)
2182 1.39 christos return -1;
2183 1.39 christos scan--;
2184 1.39 christos do {
2185 1.39 christos scan += OPND(s);
2186 1.39 christos s = *scan;
2187 1.40 christos if (OP(s) != O_QUEST &&
2188 1.40 christos OP(s) != O_CH && OP(s) != OOR2)
2189 1.39 christos return -1;
2190 1.40 christos } while (OP(s) != O_QUEST && OP(s) != O_CH);
2191 1.39 christos /* We must skip to the next position, or we'll
2192 1.39 christos * leave altoffset() too early.
2193 1.39 christos */
2194 1.39 christos scan++;
2195 1.39 christos break;
2196 1.39 christos case OANYOF:
2197 1.39 christos case OCHAR:
2198 1.39 christos case OANY:
2199 1.39 christos try++;
2200 1.40 christos /*FALLTHROUGH*/
2201 1.39 christos case OBOW:
2202 1.39 christos case OEOW:
2203 1.39 christos case OWBND:
2204 1.39 christos case ONWBND:
2205 1.39 christos case OLPAREN:
2206 1.39 christos case ORPAREN:
2207 1.39 christos case OOR2:
2208 1.39 christos break;
2209 1.39 christos default:
2210 1.39 christos try = -1;
2211 1.39 christos break;
2212 1.39 christos }
2213 1.39 christos if (try == -1)
2214 1.39 christos return -1;
2215 1.39 christos s = *scan++;
2216 1.39 christos }
2217 1.39 christos
2218 1.39 christos if (try > largest)
2219 1.39 christos largest = try;
2220 1.39 christos
2221 1.39 christos return largest+offset;
2222 1.39 christos }
2223 1.39 christos
2224 1.39 christos /*
2225 1.39 christos - computejumps - compute char jumps for BM scan
2226 1.39 christos == static void computejumps(struct parse *p, struct re_guts *g);
2227 1.39 christos *
2228 1.39 christos * This algorithm assumes g->must exists and is has size greater than
2229 1.39 christos * zero. It's based on the algorithm found on Computer Algorithms by
2230 1.39 christos * Sara Baase.
2231 1.39 christos *
2232 1.39 christos * A char jump is the number of characters one needs to jump based on
2233 1.39 christos * the value of the character from the text that was mismatched.
2234 1.39 christos */
2235 1.39 christos static void
2236 1.39 christos computejumps(struct parse *p, struct re_guts *g)
2237 1.39 christos {
2238 1.39 christos int ch;
2239 1.39 christos size_t mindex;
2240 1.39 christos
2241 1.39 christos _DIAGASSERT(p != NULL);
2242 1.39 christos _DIAGASSERT(g != NULL);
2243 1.39 christos
2244 1.39 christos /* Avoid making errors worse */
2245 1.39 christos if (p->error != 0)
2246 1.39 christos return;
2247 1.39 christos
2248 1.39 christos g->charjump = calloc((NC_MAX + 1), sizeof(*g->charjump));
2249 1.39 christos if (g->charjump == NULL) /* Not a fatal error */
2250 1.39 christos return;
2251 1.39 christos /* Adjust for signed chars, if necessary */
2252 1.39 christos g->charjump = &g->charjump[-(CHAR_MIN)];
2253 1.39 christos
2254 1.39 christos /* If the character does not exist in the pattern, the jump
2255 1.39 christos * is equal to the number of characters in the pattern.
2256 1.39 christos */
2257 1.39 christos for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
2258 1.39 christos g->charjump[ch] = g->mlen;
2259 1.39 christos
2260 1.39 christos /* If the character does exist, compute the jump that would
2261 1.39 christos * take us to the last character in the pattern equal to it
2262 1.39 christos * (notice that we match right to left, so that last character
2263 1.39 christos * is the first one that would be matched).
2264 1.39 christos */
2265 1.39 christos for (mindex = 0; mindex < g->mlen; mindex++)
2266 1.39 christos g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
2267 1.39 christos }
2268 1.39 christos
2269 1.39 christos /*
2270 1.39 christos - computematchjumps - compute match jumps for BM scan
2271 1.39 christos == static void computematchjumps(struct parse *p, struct re_guts *g);
2272 1.39 christos *
2273 1.39 christos * This algorithm assumes g->must exists and is has size greater than
2274 1.39 christos * zero. It's based on the algorithm found on Computer Algorithms by
2275 1.39 christos * Sara Baase.
2276 1.39 christos *
2277 1.39 christos * A match jump is the number of characters one needs to advance based
2278 1.39 christos * on the already-matched suffix.
2279 1.39 christos * Notice that all values here are minus (g->mlen-1), because of the way
2280 1.39 christos * the search algorithm works.
2281 1.39 christos */
2282 1.39 christos static void
2283 1.39 christos computematchjumps(struct parse *p, struct re_guts *g)
2284 1.39 christos {
2285 1.39 christos size_t mindex; /* General "must" iterator */
2286 1.39 christos size_t suffix; /* Keeps track of matching suffix */
2287 1.39 christos size_t ssuffix; /* Keeps track of suffixes' suffix */
2288 1.39 christos size_t* pmatches; /* pmatches[k] points to the next i
2289 1.39 christos * such that i+1...mlen is a substring
2290 1.39 christos * of k+1...k+mlen-i-1
2291 1.39 christos */
2292 1.39 christos
2293 1.39 christos _DIAGASSERT(p != NULL);
2294 1.39 christos _DIAGASSERT(g != NULL);
2295 1.39 christos
2296 1.39 christos /* Avoid making errors worse */
2297 1.39 christos if (p->error != 0)
2298 1.39 christos return;
2299 1.39 christos
2300 1.39 christos pmatches = calloc(g->mlen, sizeof(*pmatches));
2301 1.39 christos if (pmatches == NULL) {
2302 1.39 christos g->matchjump = NULL;
2303 1.39 christos return;
2304 1.39 christos }
2305 1.39 christos
2306 1.39 christos g->matchjump = calloc(g->mlen, sizeof(*g->matchjump));
2307 1.39 christos if (g->matchjump == NULL) { /* Not a fatal error */
2308 1.39 christos free(pmatches);
2309 1.39 christos return;
2310 1.39 christos }
2311 1.39 christos
2312 1.39 christos /* Set maximum possible jump for each character in the pattern */
2313 1.39 christos for (mindex = 0; mindex < g->mlen; mindex++)
2314 1.39 christos g->matchjump[mindex] = 2 * g->mlen - mindex - 1;
2315 1.39 christos
2316 1.39 christos /* Compute pmatches[] */
2317 1.39 christos for (suffix = mindex = g->mlen; mindex-- > 0; suffix--) {
2318 1.39 christos pmatches[mindex] = suffix;
2319 1.39 christos
2320 1.39 christos /* If a mismatch is found, interrupting the substring,
2321 1.39 christos * compute the matchjump for that position. If no
2322 1.39 christos * mismatch is found, then a text substring mismatched
2323 1.39 christos * against the suffix will also mismatch against the
2324 1.39 christos * substring.
2325 1.39 christos */
2326 1.39 christos while (suffix < g->mlen
2327 1.39 christos && g->must[mindex] != g->must[suffix]) {
2328 1.39 christos g->matchjump[suffix] = MIN(g->matchjump[suffix],
2329 1.39 christos g->mlen - mindex - 1);
2330 1.39 christos suffix = pmatches[suffix];
2331 1.39 christos }
2332 1.39 christos }
2333 1.39 christos
2334 1.39 christos /* Compute the matchjump up to the last substring found to jump
2335 1.39 christos * to the beginning of the largest must pattern prefix matching
2336 1.39 christos * it's own suffix.
2337 1.39 christos */
2338 1.39 christos for (mindex = 0; mindex <= suffix; mindex++)
2339 1.39 christos g->matchjump[mindex] = MIN(g->matchjump[mindex],
2340 1.39 christos g->mlen + suffix - mindex);
2341 1.39 christos
2342 1.39 christos ssuffix = pmatches[suffix];
2343 1.39 christos while (suffix < g->mlen) {
2344 1.39 christos while (suffix <= ssuffix && suffix < g->mlen) {
2345 1.39 christos g->matchjump[suffix] = MIN(g->matchjump[suffix],
2346 1.39 christos g->mlen + ssuffix - suffix);
2347 1.39 christos suffix++;
2348 1.39 christos }
2349 1.39 christos if (suffix < g->mlen)
2350 1.39 christos ssuffix = pmatches[ssuffix];
2351 1.39 christos }
2352 1.39 christos
2353 1.39 christos free(pmatches);
2354 1.39 christos }
2355 1.39 christos
2356 1.39 christos /*
2357 1.1 jtc - pluscount - count + nesting
2358 1.9 perry == static sopno pluscount(struct parse *p, struct re_guts *g);
2359 1.1 jtc */
2360 1.1 jtc static sopno /* nesting depth */
2361 1.39 christos pluscount(struct parse *p, struct re_guts *g)
2362 1.1 jtc {
2363 1.9 perry sop *scan;
2364 1.9 perry sop s;
2365 1.9 perry sopno plusnest = 0;
2366 1.9 perry sopno maxnest = 0;
2367 1.14 lukem
2368 1.14 lukem _DIAGASSERT(p != NULL);
2369 1.14 lukem _DIAGASSERT(g != NULL);
2370 1.1 jtc
2371 1.1 jtc if (p->error != 0)
2372 1.1 jtc return(0); /* there may not be an OEND */
2373 1.1 jtc
2374 1.1 jtc scan = g->strip + 1;
2375 1.1 jtc do {
2376 1.1 jtc s = *scan++;
2377 1.1 jtc switch (OP(s)) {
2378 1.1 jtc case OPLUS_:
2379 1.1 jtc plusnest++;
2380 1.1 jtc break;
2381 1.1 jtc case O_PLUS:
2382 1.1 jtc if (plusnest > maxnest)
2383 1.1 jtc maxnest = plusnest;
2384 1.1 jtc plusnest--;
2385 1.1 jtc break;
2386 1.1 jtc }
2387 1.1 jtc } while (OP(s) != OEND);
2388 1.1 jtc if (plusnest != 0)
2389 1.1 jtc g->iflags |= BAD;
2390 1.1 jtc return(maxnest);
2391 1.1 jtc }
2392