regcomp.c revision 1.46 1 1.46 christos /* $NetBSD: regcomp.c,v 1.46 2021/03/11 15:00:29 christos Exp $ */
2 1.6 cgd
3 1.5 cgd /*-
4 1.39 christos * SPDX-License-Identifier: BSD-3-Clause
5 1.39 christos *
6 1.39 christos * Copyright (c) 1992, 1993, 1994 Henry Spencer.
7 1.5 cgd * Copyright (c) 1992, 1993, 1994
8 1.5 cgd * The Regents of the University of California. All rights reserved.
9 1.5 cgd *
10 1.39 christos * Copyright (c) 2011 The FreeBSD Foundation
11 1.39 christos * All rights reserved.
12 1.39 christos * Portions of this software were developed by David Chisnall
13 1.39 christos * under sponsorship from the FreeBSD Foundation.
14 1.39 christos *
15 1.5 cgd * This code is derived from software contributed to Berkeley by
16 1.5 cgd * Henry Spencer.
17 1.5 cgd *
18 1.5 cgd * Redistribution and use in source and binary forms, with or without
19 1.5 cgd * modification, are permitted provided that the following conditions
20 1.5 cgd * are met:
21 1.5 cgd * 1. Redistributions of source code must retain the above copyright
22 1.5 cgd * notice, this list of conditions and the following disclaimer.
23 1.5 cgd * 2. Redistributions in binary form must reproduce the above copyright
24 1.5 cgd * notice, this list of conditions and the following disclaimer in the
25 1.5 cgd * documentation and/or other materials provided with the distribution.
26 1.18 agc * 3. Neither the name of the University nor the names of its contributors
27 1.18 agc * may be used to endorse or promote products derived from this software
28 1.18 agc * without specific prior written permission.
29 1.18 agc *
30 1.18 agc * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31 1.18 agc * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 1.18 agc * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 1.18 agc * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34 1.18 agc * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35 1.18 agc * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36 1.18 agc * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37 1.18 agc * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38 1.18 agc * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 1.18 agc * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40 1.18 agc * SUCH DAMAGE.
41 1.18 agc *
42 1.18 agc * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
43 1.18 agc */
44 1.18 agc
45 1.45 christos #if HAVE_NBTOOL_CONFIG_H
46 1.45 christos #include "nbtool_config.h"
47 1.45 christos #endif
48 1.45 christos
49 1.7 christos #include <sys/cdefs.h>
50 1.6 cgd #if 0
51 1.5 cgd static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
52 1.39 christos __FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 368359 2020-12-05 03:18:48Z kevans $");
53 1.6 cgd #endif
54 1.46 christos __RCSID("$NetBSD: regcomp.c,v 1.46 2021/03/11 15:00:29 christos Exp $");
55 1.39 christos
56 1.39 christos #define _OPENBSD_SOURCE
57 1.42 christos
58 1.42 christos #ifndef LIBHACK
59 1.39 christos #define REGEX_GNU_EXTENSIONS
60 1.5 cgd
61 1.8 jtc #include "namespace.h"
62 1.42 christos #endif
63 1.1 jtc #include <sys/types.h>
64 1.39 christos #include <stdio.h>
65 1.39 christos #include <string.h>
66 1.1 jtc #include <ctype.h>
67 1.1 jtc #include <limits.h>
68 1.1 jtc #include <stdlib.h>
69 1.28 junyoung #include <regex.h>
70 1.39 christos #include <stdbool.h>
71 1.8 jtc
72 1.44 christos #if defined(__weak_alias) && !defined(LIBHACK)
73 1.16 mycroft __weak_alias(regcomp,_regcomp)
74 1.8 jtc #endif
75 1.1 jtc
76 1.39 christos #ifdef REGEX_LIBC_COLLATE
77 1.39 christos #include "collate.h"
78 1.39 christos #endif
79 1.39 christos
80 1.1 jtc #include "utils.h"
81 1.1 jtc #include "regex2.h"
82 1.1 jtc
83 1.1 jtc #include "cname.h"
84 1.1 jtc
85 1.1 jtc /*
86 1.39 christos * Branching context, used to keep track of branch state for all of the branch-
87 1.39 christos * aware functions. In addition to keeping track of branch positions for the
88 1.39 christos * p_branch_* functions, we use this to simplify some clumsiness in BREs for
89 1.39 christos * detection of whether ^ is acting as an anchor or being used erroneously and
90 1.39 christos * also for whether we're in a sub-expression or not.
91 1.39 christos */
92 1.39 christos struct branchc {
93 1.39 christos sopno start;
94 1.39 christos sopno back;
95 1.39 christos sopno fwd;
96 1.39 christos
97 1.39 christos int nbranch;
98 1.39 christos int nchain;
99 1.39 christos bool outer;
100 1.39 christos bool terminate;
101 1.39 christos };
102 1.39 christos
103 1.39 christos /*
104 1.1 jtc * parse structure, passed up and down to avoid global variables and
105 1.1 jtc * other clumsinesses
106 1.1 jtc */
107 1.1 jtc struct parse {
108 1.21 yamt const char *next; /* next character in RE */
109 1.21 yamt const char *end; /* end of string (-> NUL normally) */
110 1.1 jtc int error; /* has an error been seen? */
111 1.39 christos int gnuext;
112 1.1 jtc sop *strip; /* malloced strip */
113 1.1 jtc sopno ssize; /* malloced strip size (allocated) */
114 1.1 jtc sopno slen; /* malloced strip length (used) */
115 1.33 christos size_t ncsalloc; /* number of csets allocated */
116 1.1 jtc struct re_guts *g;
117 1.1 jtc # define NPAREN 10 /* we need to remember () 1-9 for back refs */
118 1.1 jtc sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
119 1.1 jtc sopno pend[NPAREN]; /* -> ) ([0] unused) */
120 1.39 christos bool allowbranch; /* can this expression branch? */
121 1.39 christos bool bre; /* convenience; is this a BRE? */
122 1.39 christos int pflags; /* other parsing flags -- legacy escapes? */
123 1.39 christos bool (*parse_expr)(struct parse *, struct branchc *);
124 1.39 christos void (*pre_parse)(struct parse *, struct branchc *);
125 1.39 christos void (*post_parse)(struct parse *, struct branchc *);
126 1.1 jtc };
127 1.1 jtc
128 1.39 christos #define PFLAG_LEGACY_ESC 0x00000001
129 1.39 christos
130 1.5 cgd /* ========= begin header generated by ./mkh ========= */
131 1.5 cgd #ifdef __cplusplus
132 1.5 cgd extern "C" {
133 1.5 cgd #endif
134 1.5 cgd
135 1.5 cgd /* === regcomp.c === */
136 1.39 christos static bool p_ere_exp(struct parse *p, struct branchc *bc);
137 1.26 junyoung static void p_str(struct parse *p);
138 1.39 christos static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
139 1.39 christos static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
140 1.39 christos static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
141 1.39 christos static bool p_branch_empty(struct parse *p, struct branchc *bc);
142 1.39 christos static bool p_branch_do(struct parse *p, struct branchc *bc);
143 1.39 christos static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
144 1.39 christos static void p_bre_post_parse(struct parse *p, struct branchc *bc);
145 1.39 christos static void p_re(struct parse *p, int end1, int end2);
146 1.39 christos static bool p_simp_re(struct parse *p, struct branchc *bc);
147 1.26 junyoung static int p_count(struct parse *p);
148 1.26 junyoung static void p_bracket(struct parse *p);
149 1.39 christos static int p_range_cmp(wchar_t c1, wchar_t c2);
150 1.26 junyoung static void p_b_term(struct parse *p, cset *cs);
151 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
152 1.39 christos static int p_b_pseudoclass(struct parse *p, char c);
153 1.39 christos #endif
154 1.26 junyoung static void p_b_cclass(struct parse *p, cset *cs);
155 1.39 christos static void p_b_cclass_named(struct parse *p, cset *cs, const char[]);
156 1.26 junyoung static void p_b_eclass(struct parse *p, cset *cs);
157 1.39 christos static wint_t p_b_symbol(struct parse *p);
158 1.39 christos static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
159 1.39 christos static bool may_escape(struct parse *p, const wint_t ch);
160 1.39 christos static wint_t othercase(wint_t ch);
161 1.39 christos static void bothcases(struct parse *p, wint_t ch);
162 1.39 christos static void ordinary(struct parse *p, wint_t ch);
163 1.26 junyoung static void nonnewline(struct parse *p);
164 1.39 christos static void repeat(struct parse *p, sopno start, int from, int to);
165 1.26 junyoung static int seterr(struct parse *p, int e);
166 1.26 junyoung static cset *allocset(struct parse *p);
167 1.26 junyoung static void freeset(struct parse *p, cset *cs);
168 1.39 christos static void CHadd(struct parse *p, cset *cs, wint_t ch);
169 1.39 christos static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
170 1.39 christos static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
171 1.39 christos static wint_t singleton(cset *cs);
172 1.26 junyoung static sopno dupl(struct parse *p, sopno start, sopno finish);
173 1.39 christos static void doemit(struct parse *p, sop op, size_t opnd);
174 1.39 christos static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
175 1.39 christos static void dofwd(struct parse *p, sopno pos, sop value);
176 1.30 christos static int enlarge(struct parse *p, sopno size);
177 1.26 junyoung static void stripsnug(struct parse *p, struct re_guts *g);
178 1.26 junyoung static void findmust(struct parse *p, struct re_guts *g);
179 1.39 christos static int altoffset(sop *scan, int offset);
180 1.39 christos static void computejumps(struct parse *p, struct re_guts *g);
181 1.39 christos static void computematchjumps(struct parse *p, struct re_guts *g);
182 1.26 junyoung static sopno pluscount(struct parse *p, struct re_guts *g);
183 1.39 christos static wint_t wgetnext(struct parse *p);
184 1.5 cgd
185 1.5 cgd #ifdef __cplusplus
186 1.5 cgd }
187 1.5 cgd #endif
188 1.5 cgd /* ========= end header generated by ./mkh ========= */
189 1.1 jtc
190 1.1 jtc static char nuls[10]; /* place to point scanner in event of error */
191 1.1 jtc
192 1.1 jtc /*
193 1.1 jtc * macros for use with parse structure
194 1.1 jtc * BEWARE: these know that the parse structure is named `p' !!!
195 1.1 jtc */
196 1.1 jtc #define PEEK() (*p->next)
197 1.1 jtc #define PEEK2() (*(p->next+1))
198 1.1 jtc #define MORE() (p->next < p->end)
199 1.1 jtc #define MORE2() (p->next+1 < p->end)
200 1.1 jtc #define SEE(c) (MORE() && PEEK() == (c))
201 1.1 jtc #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
202 1.39 christos #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a))
203 1.1 jtc #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
204 1.1 jtc #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
205 1.39 christos #define EATSPEC(a) (p->bre ? EATTWO('\\', a) : EAT(a))
206 1.1 jtc #define NEXT() (p->next++)
207 1.1 jtc #define NEXT2() (p->next += 2)
208 1.1 jtc #define NEXTn(n) (p->next += (n))
209 1.1 jtc #define GETNEXT() (*p->next++)
210 1.39 christos #define WGETNEXT() wgetnext(p)
211 1.1 jtc #define SETERROR(e) seterr(p, (e))
212 1.39 christos #define REQUIRE(co, e) ((co) || SETERROR(e))
213 1.1 jtc #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
214 1.39 christos #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
215 1.1 jtc #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
216 1.40 christos #define EMIT(op, sopnd) doemit(p, (op), (sopnd))
217 1.40 christos #define INSERT(op, pos) doinsert(p, (op), HERE()-(pos)+1, pos)
218 1.12 drochner #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
219 1.1 jtc #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
220 1.1 jtc #define HERE() (p->slen)
221 1.1 jtc #define THERE() (p->slen - 1)
222 1.4 jtc #define THERETHERE() (p->slen - 2)
223 1.1 jtc #define DROP(n) (p->slen -= (n))
224 1.1 jtc
225 1.39 christos /* Macro used by computejump()/computematchjump() */
226 1.41 christos #ifndef MIN
227 1.39 christos #define MIN(a,b) ((a)<(b)?(a):(b))
228 1.41 christos #endif
229 1.30 christos
230 1.42 christos #ifndef NLS
231 1.42 christos static const struct {
232 1.42 christos const char *name;
233 1.42 christos int (*func)(int);
234 1.42 christos } wctypes[] = {
235 1.42 christos #define ADD(x) { .name = # x, .func = is ## x }
236 1.42 christos ADD(alnum),
237 1.42 christos ADD(alpha),
238 1.42 christos ADD(blank),
239 1.42 christos ADD(cntrl),
240 1.42 christos ADD(digit),
241 1.42 christos ADD(graph),
242 1.42 christos ADD(lower),
243 1.42 christos ADD(print),
244 1.42 christos ADD(punct),
245 1.42 christos ADD(space),
246 1.42 christos ADD(upper),
247 1.42 christos ADD(xdigit),
248 1.42 christos #undef ADD
249 1.42 christos };
250 1.42 christos
251 1.42 christos wctype_t
252 1.42 christos __regex_wctype(const char *str)
253 1.42 christos {
254 1.42 christos for (size_t i = 0; i < __arraycount(wctypes); i++) {
255 1.42 christos if (strcmp(wctypes[i].name, str) == 0)
256 1.42 christos return (wctype_t)(i + 1);
257 1.42 christos }
258 1.42 christos return (wctype_t)0;
259 1.42 christos }
260 1.42 christos
261 1.42 christos int
262 1.42 christos __regex_iswctype(wint_t c, wctype_t ct)
263 1.42 christos {
264 1.42 christos if (ct == 0)
265 1.42 christos return 0;
266 1.42 christos return (*wctypes[ct - 1].func)(c);
267 1.42 christos }
268 1.42 christos #endif
269 1.42 christos
270 1.39 christos static int /* 0 success, otherwise REG_something */
271 1.39 christos regcomp_internal(regex_t * __restrict preg,
272 1.39 christos const char * __restrict pattern,
273 1.39 christos int cflags, int pflags)
274 1.1 jtc {
275 1.1 jtc struct parse pa;
276 1.9 perry struct re_guts *g;
277 1.9 perry struct parse *p = &pa;
278 1.9 perry int i;
279 1.9 perry size_t len;
280 1.39 christos size_t maxlen;
281 1.3 jtc #ifdef REDEBUG
282 1.3 jtc # define GOODFLAGS(f) (f)
283 1.3 jtc #else
284 1.3 jtc # define GOODFLAGS(f) ((f)&~REG_DUMP)
285 1.3 jtc #endif
286 1.1 jtc
287 1.14 lukem _DIAGASSERT(preg != NULL);
288 1.14 lukem _DIAGASSERT(pattern != NULL);
289 1.14 lukem
290 1.3 jtc cflags = GOODFLAGS(cflags);
291 1.1 jtc if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
292 1.1 jtc return(REG_INVARG);
293 1.1 jtc
294 1.1 jtc if (cflags®_PEND) {
295 1.1 jtc if (preg->re_endp < pattern)
296 1.1 jtc return(REG_INVARG);
297 1.1 jtc len = preg->re_endp - pattern;
298 1.1 jtc } else
299 1.11 christos len = strlen(pattern);
300 1.1 jtc
301 1.1 jtc /* do the mallocs early so failure handling is easy */
302 1.39 christos g = malloc(sizeof(*g));
303 1.1 jtc if (g == NULL)
304 1.1 jtc return(REG_ESPACE);
305 1.39 christos /*
306 1.39 christos * Limit the pattern space to avoid a 32-bit overflow on buffer
307 1.39 christos * extension. Also avoid any signed overflow in case of conversion
308 1.39 christos * so make the real limit based on a 31-bit overflow.
309 1.39 christos *
310 1.39 christos * Likely not applicable on 64-bit systems but handle the case
311 1.39 christos * generically (who are we to stop people from using ~715MB+
312 1.39 christos * patterns?).
313 1.39 christos */
314 1.40 christos maxlen = ((size_t)-1 >> 1) / sizeof(*p->strip) * 2 / 3;
315 1.39 christos if (len >= maxlen) {
316 1.40 christos free(g);
317 1.39 christos return(REG_ESPACE);
318 1.39 christos }
319 1.40 christos p->ssize = (sopno)(len / 2 * 3 + 1); /* ugh */
320 1.39 christos assert(p->ssize >= len);
321 1.39 christos
322 1.39 christos p->strip = calloc(p->ssize, sizeof(*p->strip));
323 1.1 jtc p->slen = 0;
324 1.1 jtc if (p->strip == NULL) {
325 1.40 christos free(g);
326 1.1 jtc return(REG_ESPACE);
327 1.1 jtc }
328 1.1 jtc
329 1.1 jtc /* set things up */
330 1.1 jtc p->g = g;
331 1.39 christos p->next = pattern; /* convenience; we do not modify it */
332 1.1 jtc p->end = p->next + len;
333 1.1 jtc p->error = 0;
334 1.1 jtc p->ncsalloc = 0;
335 1.39 christos p->pflags = pflags;
336 1.1 jtc for (i = 0; i < NPAREN; i++) {
337 1.1 jtc p->pbegin[i] = 0;
338 1.1 jtc p->pend[i] = 0;
339 1.1 jtc }
340 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
341 1.39 christos if ((cflags & REG_GNU) == 0) {
342 1.39 christos p->gnuext = false;
343 1.39 christos p->allowbranch = (cflags & REG_EXTENDED) != 0;
344 1.39 christos } else
345 1.39 christos p->gnuext = p->allowbranch = true;
346 1.39 christos #else
347 1.39 christos p->gnuext = false;
348 1.39 christos p->allowbranch = (cflags & REG_EXTENDED) != 0;
349 1.39 christos #endif
350 1.39 christos if (cflags & REG_EXTENDED) {
351 1.39 christos p->bre = false;
352 1.39 christos p->parse_expr = p_ere_exp;
353 1.39 christos p->pre_parse = NULL;
354 1.39 christos p->post_parse = NULL;
355 1.39 christos } else {
356 1.39 christos p->bre = true;
357 1.39 christos p->parse_expr = p_simp_re;
358 1.39 christos p->pre_parse = p_bre_pre_parse;
359 1.39 christos p->post_parse = p_bre_post_parse;
360 1.39 christos }
361 1.1 jtc g->sets = NULL;
362 1.1 jtc g->ncsets = 0;
363 1.1 jtc g->cflags = cflags;
364 1.1 jtc g->iflags = 0;
365 1.1 jtc g->nbol = 0;
366 1.1 jtc g->neol = 0;
367 1.1 jtc g->must = NULL;
368 1.39 christos g->moffset = -1;
369 1.39 christos g->charjump = NULL;
370 1.39 christos g->matchjump = NULL;
371 1.1 jtc g->mlen = 0;
372 1.1 jtc g->nsub = 0;
373 1.1 jtc g->backrefs = 0;
374 1.1 jtc
375 1.1 jtc /* do it */
376 1.1 jtc EMIT(OEND, 0);
377 1.1 jtc g->firststate = THERE();
378 1.39 christos if (cflags & REG_NOSPEC)
379 1.1 jtc p_str(p);
380 1.1 jtc else
381 1.39 christos p_re(p, OUT, OUT);
382 1.1 jtc EMIT(OEND, 0);
383 1.1 jtc g->laststate = THERE();
384 1.1 jtc
385 1.1 jtc /* tidy up loose ends and fill things in */
386 1.1 jtc stripsnug(p, g);
387 1.1 jtc findmust(p, g);
388 1.39 christos /* only use Boyer-Moore algorithm if the pattern is bigger
389 1.39 christos * than three characters
390 1.39 christos */
391 1.39 christos if(g->mlen > 3) {
392 1.39 christos computejumps(p, g);
393 1.39 christos computematchjumps(p, g);
394 1.39 christos if(g->matchjump == NULL && g->charjump != NULL) {
395 1.39 christos free(g->charjump);
396 1.39 christos g->charjump = NULL;
397 1.39 christos }
398 1.39 christos }
399 1.1 jtc g->nplus = pluscount(p, g);
400 1.1 jtc g->magic = MAGIC2;
401 1.1 jtc preg->re_nsub = g->nsub;
402 1.1 jtc preg->re_g = g;
403 1.1 jtc preg->re_magic = MAGIC1;
404 1.1 jtc #ifndef REDEBUG
405 1.1 jtc /* not debugging, so can't rely on the assert() in regexec() */
406 1.1 jtc if (g->iflags&BAD)
407 1.1 jtc SETERROR(REG_ASSERT);
408 1.1 jtc #endif
409 1.1 jtc
410 1.1 jtc /* win or lose, we're done */
411 1.1 jtc if (p->error != 0) /* lose */
412 1.1 jtc regfree(preg);
413 1.1 jtc return(p->error);
414 1.1 jtc }
415 1.1 jtc
416 1.1 jtc /*
417 1.39 christos - regcomp - interface for parser and compilation
418 1.39 christos = extern int regcomp(regex_t *, const char *, int);
419 1.39 christos = #define REG_BASIC 0000
420 1.39 christos = #define REG_EXTENDED 0001
421 1.39 christos = #define REG_ICASE 0002
422 1.39 christos = #define REG_NOSUB 0004
423 1.39 christos = #define REG_NEWLINE 0010
424 1.39 christos = #define REG_NOSPEC 0020
425 1.39 christos = #define REG_PEND 0040
426 1.39 christos = #define REG_DUMP 0200
427 1.1 jtc */
428 1.39 christos int /* 0 success, otherwise REG_something */
429 1.39 christos regcomp(regex_t * __restrict preg,
430 1.39 christos const char * __restrict pattern,
431 1.39 christos int cflags)
432 1.1 jtc {
433 1.30 christos
434 1.39 christos return (regcomp_internal(preg, pattern, cflags, 0));
435 1.1 jtc }
436 1.1 jtc
437 1.1 jtc /*
438 1.39 christos - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
439 1.39 christos - return whether we should terminate or not
440 1.39 christos == static bool p_ere_exp(struct parse *p);
441 1.1 jtc */
442 1.39 christos static bool
443 1.39 christos p_ere_exp(struct parse *p, struct branchc *bc)
444 1.1 jtc {
445 1.9 perry char c;
446 1.39 christos wint_t wc;
447 1.9 perry sopno pos;
448 1.9 perry int count;
449 1.9 perry int count2;
450 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
451 1.40 christos size_t i;
452 1.39 christos int handled;
453 1.39 christos #endif
454 1.9 perry sopno subno;
455 1.1 jtc int wascaret = 0;
456 1.1 jtc
457 1.14 lukem _DIAGASSERT(p != NULL);
458 1.14 lukem
459 1.39 christos (void)bc;
460 1.1 jtc assert(MORE()); /* caller should have ensured this */
461 1.1 jtc c = GETNEXT();
462 1.1 jtc
463 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
464 1.39 christos handled = 0;
465 1.39 christos #endif
466 1.1 jtc pos = HERE();
467 1.1 jtc switch (c) {
468 1.1 jtc case '(':
469 1.39 christos (void)REQUIRE(MORE(), REG_EPAREN);
470 1.1 jtc p->g->nsub++;
471 1.40 christos subno = (sopno)p->g->nsub;
472 1.1 jtc if (subno < NPAREN)
473 1.1 jtc p->pbegin[subno] = HERE();
474 1.1 jtc EMIT(OLPAREN, subno);
475 1.1 jtc if (!SEE(')'))
476 1.39 christos p_re(p, ')', IGN);
477 1.1 jtc if (subno < NPAREN) {
478 1.1 jtc p->pend[subno] = HERE();
479 1.1 jtc assert(p->pend[subno] != 0);
480 1.1 jtc }
481 1.1 jtc EMIT(ORPAREN, subno);
482 1.39 christos (void)MUSTEAT(')', REG_EPAREN);
483 1.1 jtc break;
484 1.1 jtc #ifndef POSIX_MISTAKE
485 1.1 jtc case ')': /* happens only if no current unmatched ( */
486 1.1 jtc /*
487 1.1 jtc * You may ask, why the ifndef? Because I didn't notice
488 1.1 jtc * this until slightly too late for 1003.2, and none of the
489 1.1 jtc * other 1003.2 regular-expression reviewers noticed it at
490 1.1 jtc * all. So an unmatched ) is legal POSIX, at least until
491 1.1 jtc * we can get it fixed.
492 1.1 jtc */
493 1.1 jtc SETERROR(REG_EPAREN);
494 1.1 jtc break;
495 1.1 jtc #endif
496 1.1 jtc case '^':
497 1.1 jtc EMIT(OBOL, 0);
498 1.1 jtc p->g->iflags |= USEBOL;
499 1.1 jtc p->g->nbol++;
500 1.1 jtc wascaret = 1;
501 1.1 jtc break;
502 1.1 jtc case '$':
503 1.1 jtc EMIT(OEOL, 0);
504 1.1 jtc p->g->iflags |= USEEOL;
505 1.1 jtc p->g->neol++;
506 1.1 jtc break;
507 1.1 jtc case '|':
508 1.1 jtc SETERROR(REG_EMPTY);
509 1.1 jtc break;
510 1.1 jtc case '*':
511 1.1 jtc case '+':
512 1.1 jtc case '?':
513 1.39 christos case '{':
514 1.1 jtc SETERROR(REG_BADRPT);
515 1.1 jtc break;
516 1.1 jtc case '.':
517 1.1 jtc if (p->g->cflags®_NEWLINE)
518 1.1 jtc nonnewline(p);
519 1.1 jtc else
520 1.1 jtc EMIT(OANY, 0);
521 1.1 jtc break;
522 1.1 jtc case '[':
523 1.1 jtc p_bracket(p);
524 1.1 jtc break;
525 1.1 jtc case '\\':
526 1.39 christos (void)REQUIRE(MORE(), REG_EESCAPE);
527 1.39 christos wc = WGETNEXT();
528 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
529 1.39 christos if (p->gnuext) {
530 1.39 christos handled = 1;
531 1.39 christos switch (wc) {
532 1.39 christos case '`':
533 1.39 christos EMIT(OBOS, 0);
534 1.39 christos break;
535 1.39 christos case '\'':
536 1.39 christos EMIT(OEOS, 0);
537 1.39 christos break;
538 1.39 christos case 'B':
539 1.39 christos EMIT(ONWBND, 0);
540 1.39 christos break;
541 1.39 christos case 'b':
542 1.39 christos EMIT(OWBND, 0);
543 1.39 christos break;
544 1.39 christos case 'W':
545 1.39 christos case 'w':
546 1.39 christos case 'S':
547 1.39 christos case 's':
548 1.39 christos p_b_pseudoclass(p, wc);
549 1.39 christos break;
550 1.46 christos case 'a':
551 1.46 christos ordinary(p, '\a');
552 1.46 christos break;
553 1.46 christos case 'e':
554 1.46 christos ordinary(p, '\e');
555 1.46 christos break;
556 1.46 christos case 'f':
557 1.46 christos ordinary(p, '\f');
558 1.46 christos break;
559 1.46 christos case 'n':
560 1.46 christos ordinary(p, '\n');
561 1.46 christos break;
562 1.46 christos case 'r':
563 1.46 christos ordinary(p, '\r');
564 1.46 christos break;
565 1.46 christos case 't':
566 1.46 christos ordinary(p, '\t');
567 1.46 christos break;
568 1.46 christos case 'v':
569 1.46 christos ordinary(p, '\v');
570 1.46 christos break;
571 1.39 christos case '1':
572 1.39 christos case '2':
573 1.39 christos case '3':
574 1.39 christos case '4':
575 1.39 christos case '5':
576 1.39 christos case '6':
577 1.39 christos case '7':
578 1.39 christos case '8':
579 1.39 christos case '9':
580 1.39 christos i = wc - '0';
581 1.39 christos assert(i < NPAREN);
582 1.39 christos if (p->pend[i] != 0) {
583 1.39 christos assert(i <= p->g->nsub);
584 1.39 christos EMIT(OBACK_, i);
585 1.39 christos assert(p->pbegin[i] != 0);
586 1.39 christos assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
587 1.39 christos assert(OP(p->strip[p->pend[i]]) == ORPAREN);
588 1.39 christos (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
589 1.39 christos EMIT(O_BACK, i);
590 1.39 christos } else
591 1.39 christos SETERROR(REG_ESUBREG);
592 1.39 christos p->g->backrefs = 1;
593 1.39 christos break;
594 1.39 christos default:
595 1.39 christos handled = 0;
596 1.39 christos }
597 1.39 christos /* Don't proceed to the POSIX bits if we've already handled it */
598 1.39 christos if (handled)
599 1.39 christos break;
600 1.39 christos }
601 1.39 christos #endif
602 1.39 christos switch (wc) {
603 1.39 christos case '<':
604 1.39 christos EMIT(OBOW, 0);
605 1.39 christos break;
606 1.39 christos case '>':
607 1.39 christos EMIT(OEOW, 0);
608 1.39 christos break;
609 1.39 christos default:
610 1.39 christos if (may_escape(p, wc))
611 1.39 christos ordinary(p, wc);
612 1.39 christos else
613 1.39 christos SETERROR(REG_EESCAPE);
614 1.39 christos break;
615 1.39 christos }
616 1.1 jtc break;
617 1.1 jtc default:
618 1.38 christos if (p->error != 0)
619 1.39 christos return (false);
620 1.39 christos p->next--;
621 1.39 christos wc = WGETNEXT();
622 1.39 christos ordinary(p, wc);
623 1.1 jtc break;
624 1.1 jtc }
625 1.1 jtc
626 1.1 jtc if (!MORE())
627 1.39 christos return (false);
628 1.1 jtc c = PEEK();
629 1.1 jtc /* we call { a repetition if followed by a digit */
630 1.39 christos if (!( c == '*' || c == '+' || c == '?' || c == '{'))
631 1.39 christos return (false); /* no repetition, we're done */
632 1.39 christos else if (c == '{')
633 1.39 christos (void)REQUIRE(MORE2() && \
634 1.39 christos (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
635 1.1 jtc NEXT();
636 1.1 jtc
637 1.39 christos (void)REQUIRE(!wascaret, REG_BADRPT);
638 1.1 jtc switch (c) {
639 1.1 jtc case '*': /* implemented as +? */
640 1.4 jtc /* this case does not require the (y|) trick, noKLUDGE */
641 1.1 jtc INSERT(OPLUS_, pos);
642 1.1 jtc ASTERN(O_PLUS, pos);
643 1.1 jtc INSERT(OQUEST_, pos);
644 1.1 jtc ASTERN(O_QUEST, pos);
645 1.1 jtc break;
646 1.1 jtc case '+':
647 1.1 jtc INSERT(OPLUS_, pos);
648 1.1 jtc ASTERN(O_PLUS, pos);
649 1.1 jtc break;
650 1.1 jtc case '?':
651 1.4 jtc /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
652 1.4 jtc INSERT(OCH_, pos); /* offset slightly wrong */
653 1.4 jtc ASTERN(OOR1, pos); /* this one's right */
654 1.4 jtc AHEAD(pos); /* fix the OCH_ */
655 1.4 jtc EMIT(OOR2, 0); /* offset very wrong... */
656 1.4 jtc AHEAD(THERE()); /* ...so fix it */
657 1.4 jtc ASTERN(O_CH, THERETHERE());
658 1.1 jtc break;
659 1.1 jtc case '{':
660 1.1 jtc count = p_count(p);
661 1.1 jtc if (EAT(',')) {
662 1.39 christos if (isdigit((uch)PEEK())) {
663 1.1 jtc count2 = p_count(p);
664 1.39 christos (void)REQUIRE(count <= count2, REG_BADBR);
665 1.1 jtc } else /* single number with comma */
666 1.1 jtc count2 = INFINITY;
667 1.1 jtc } else /* just a single number */
668 1.1 jtc count2 = count;
669 1.39 christos repeat(p, pos, count, count2);
670 1.1 jtc if (!EAT('}')) { /* error heuristics */
671 1.1 jtc while (MORE() && PEEK() != '}')
672 1.1 jtc NEXT();
673 1.39 christos (void)REQUIRE(MORE(), REG_EBRACE);
674 1.1 jtc SETERROR(REG_BADBR);
675 1.1 jtc }
676 1.1 jtc break;
677 1.1 jtc }
678 1.1 jtc
679 1.1 jtc if (!MORE())
680 1.39 christos return (false);
681 1.1 jtc c = PEEK();
682 1.1 jtc if (!( c == '*' || c == '+' || c == '?' ||
683 1.39 christos (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
684 1.39 christos return (false);
685 1.1 jtc SETERROR(REG_BADRPT);
686 1.39 christos return (false);
687 1.1 jtc }
688 1.1 jtc
689 1.1 jtc /*
690 1.1 jtc - p_str - string (no metacharacters) "parser"
691 1.9 perry == static void p_str(struct parse *p);
692 1.1 jtc */
693 1.1 jtc static void
694 1.39 christos p_str(struct parse *p)
695 1.1 jtc {
696 1.39 christos (void)REQUIRE(MORE(), REG_EMPTY);
697 1.39 christos while (MORE())
698 1.39 christos ordinary(p, WGETNEXT());
699 1.39 christos }
700 1.14 lukem
701 1.39 christos /*
702 1.39 christos * Eat consecutive branch delimiters for the kind of expression that we are
703 1.39 christos * parsing, return the number of delimiters that we ate.
704 1.39 christos */
705 1.39 christos static int
706 1.39 christos p_branch_eat_delim(struct parse *p, struct branchc *bc)
707 1.39 christos {
708 1.39 christos int nskip;
709 1.14 lukem
710 1.39 christos (void)bc;
711 1.39 christos nskip = 0;
712 1.39 christos while (EATSPEC('|'))
713 1.39 christos ++nskip;
714 1.39 christos return (nskip);
715 1.1 jtc }
716 1.1 jtc
717 1.1 jtc /*
718 1.39 christos * Insert necessary branch book-keeping operations. This emits a
719 1.39 christos * bogus 'next' offset, since we still have more to parse
720 1.1 jtc */
721 1.1 jtc static void
722 1.39 christos p_branch_ins_offset(struct parse *p, struct branchc *bc)
723 1.9 perry {
724 1.1 jtc
725 1.39 christos if (bc->nbranch == 0) {
726 1.39 christos INSERT(OCH_, bc->start); /* offset is wrong */
727 1.39 christos bc->fwd = bc->start;
728 1.39 christos bc->back = bc->start;
729 1.39 christos }
730 1.39 christos
731 1.39 christos ASTERN(OOR1, bc->back);
732 1.39 christos bc->back = THERE();
733 1.39 christos AHEAD(bc->fwd); /* fix previous offset */
734 1.39 christos bc->fwd = HERE();
735 1.39 christos EMIT(OOR2, 0); /* offset is very wrong */
736 1.39 christos ++bc->nbranch;
737 1.39 christos }
738 1.39 christos
739 1.39 christos /*
740 1.39 christos * Fix the offset of the tail branch, if we actually had any branches.
741 1.39 christos * This is to correct the bogus placeholder offset that we use.
742 1.39 christos */
743 1.39 christos static void
744 1.39 christos p_branch_fix_tail(struct parse *p, struct branchc *bc)
745 1.39 christos {
746 1.14 lukem
747 1.39 christos /* Fix bogus offset at the tail if we actually have branches */
748 1.39 christos if (bc->nbranch > 0) {
749 1.39 christos AHEAD(bc->fwd);
750 1.39 christos ASTERN(O_CH, bc->back);
751 1.30 christos }
752 1.39 christos }
753 1.39 christos
754 1.39 christos /*
755 1.39 christos * Signal to the parser that an empty branch has been encountered; this will,
756 1.39 christos * in the future, be used to allow for more permissive behavior with empty
757 1.39 christos * branches. The return value should indicate whether parsing may continue
758 1.39 christos * or not.
759 1.39 christos */
760 1.39 christos static bool
761 1.39 christos p_branch_empty(struct parse *p, struct branchc *bc)
762 1.39 christos {
763 1.39 christos
764 1.39 christos (void)bc;
765 1.39 christos SETERROR(REG_EMPTY);
766 1.39 christos return (false);
767 1.39 christos }
768 1.39 christos
769 1.39 christos /*
770 1.39 christos * Take care of any branching requirements. This includes inserting the
771 1.39 christos * appropriate branching instructions as well as eating all of the branch
772 1.39 christos * delimiters until we either run out of pattern or need to parse more pattern.
773 1.39 christos */
774 1.39 christos static bool
775 1.39 christos p_branch_do(struct parse *p, struct branchc *bc)
776 1.39 christos {
777 1.39 christos int ate = 0;
778 1.39 christos
779 1.39 christos ate = p_branch_eat_delim(p, bc);
780 1.39 christos if (ate == 0)
781 1.39 christos return (false);
782 1.39 christos else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
783 1.39 christos /*
784 1.39 christos * Halt parsing only if we have an empty branch and p_branch_empty
785 1.39 christos * indicates that we must not continue. In the future, this will not
786 1.39 christos * necessarily be an error.
787 1.39 christos */
788 1.39 christos return (false);
789 1.39 christos p_branch_ins_offset(p, bc);
790 1.39 christos
791 1.39 christos return (true);
792 1.39 christos }
793 1.30 christos
794 1.39 christos static void
795 1.39 christos p_bre_pre_parse(struct parse *p, struct branchc *bc)
796 1.39 christos {
797 1.14 lukem
798 1.40 christos (void)bc;
799 1.39 christos /*
800 1.39 christos * Does not move cleanly into expression parser because of
801 1.39 christos * ordinary interpration of * at the beginning position of
802 1.39 christos * an expression.
803 1.39 christos */
804 1.1 jtc if (EAT('^')) {
805 1.1 jtc EMIT(OBOL, 0);
806 1.1 jtc p->g->iflags |= USEBOL;
807 1.1 jtc p->g->nbol++;
808 1.1 jtc }
809 1.39 christos }
810 1.39 christos
811 1.39 christos static void
812 1.39 christos p_bre_post_parse(struct parse *p, struct branchc *bc)
813 1.39 christos {
814 1.39 christos
815 1.39 christos /* Expression is terminating due to EOL token */
816 1.39 christos if (bc->terminate) {
817 1.1 jtc DROP(1);
818 1.1 jtc EMIT(OEOL, 0);
819 1.1 jtc p->g->iflags |= USEEOL;
820 1.1 jtc p->g->neol++;
821 1.1 jtc }
822 1.39 christos }
823 1.1 jtc
824 1.39 christos /*
825 1.39 christos - p_re - Top level parser, concatenation and BRE anchoring
826 1.39 christos == static void p_re(struct parse *p, int end1, int end2);
827 1.39 christos * Giving end1 as OUT essentially eliminates the end1/end2 check.
828 1.39 christos *
829 1.39 christos * This implementation is a bit of a kludge, in that a trailing $ is first
830 1.39 christos * taken as an ordinary character and then revised to be an anchor.
831 1.39 christos * The amount of lookahead needed to avoid this kludge is excessive.
832 1.39 christos */
833 1.39 christos static void
834 1.39 christos p_re(struct parse *p,
835 1.39 christos int end1, /* first terminating character */
836 1.39 christos int end2) /* second terminating character; ignored for EREs */
837 1.39 christos {
838 1.39 christos struct branchc bc;
839 1.39 christos
840 1.39 christos bc.nbranch = 0;
841 1.39 christos if (end1 == OUT && end2 == OUT)
842 1.39 christos bc.outer = true;
843 1.39 christos else
844 1.39 christos bc.outer = false;
845 1.39 christos #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2))
846 1.39 christos for (;;) {
847 1.39 christos bc.start = HERE();
848 1.39 christos bc.nchain = 0;
849 1.39 christos bc.terminate = false;
850 1.39 christos if (p->pre_parse != NULL)
851 1.39 christos p->pre_parse(p, &bc);
852 1.39 christos while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
853 1.39 christos bc.terminate = p->parse_expr(p, &bc);
854 1.39 christos ++bc.nchain;
855 1.39 christos }
856 1.39 christos if (p->post_parse != NULL)
857 1.39 christos p->post_parse(p, &bc);
858 1.39 christos (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY);
859 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
860 1.39 christos if (p->gnuext && HERE() == bc.start && !p_branch_empty(p, &bc))
861 1.39 christos break;
862 1.39 christos #endif
863 1.39 christos if (!p->allowbranch)
864 1.39 christos break;
865 1.39 christos /*
866 1.39 christos * p_branch_do's return value indicates whether we should
867 1.39 christos * continue parsing or not. This is both for correctness and
868 1.39 christos * a slight optimization, because it will check if we've
869 1.39 christos * encountered an empty branch or the end of the string
870 1.39 christos * immediately following a branch delimiter.
871 1.39 christos */
872 1.39 christos if (!p_branch_do(p, &bc))
873 1.39 christos break;
874 1.39 christos }
875 1.39 christos #undef SEE_END
876 1.39 christos if (p->allowbranch)
877 1.39 christos p_branch_fix_tail(p, &bc);
878 1.39 christos assert(!MORE() || SEE(end1));
879 1.1 jtc }
880 1.1 jtc
881 1.1 jtc /*
882 1.1 jtc - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
883 1.39 christos == static bool p_simp_re(struct parse *p, struct branchc *bc);
884 1.1 jtc */
885 1.39 christos static bool /* was the simple RE an unbackslashed $? */
886 1.39 christos p_simp_re(struct parse *p, struct branchc *bc)
887 1.1 jtc {
888 1.9 perry int c;
889 1.39 christos int cc; /* convenient/control character */
890 1.9 perry int count;
891 1.9 perry int count2;
892 1.39 christos sopno pos;
893 1.39 christos bool handled;
894 1.40 christos size_t i;
895 1.39 christos wint_t wc;
896 1.9 perry sopno subno;
897 1.1 jtc # define BACKSL (1<<CHAR_BIT)
898 1.1 jtc
899 1.39 christos pos = HERE(); /* repetition op, if any, covers from here */
900 1.39 christos handled = false;
901 1.1 jtc
902 1.1 jtc assert(MORE()); /* caller should have ensured this */
903 1.1 jtc c = GETNEXT();
904 1.1 jtc if (c == '\\') {
905 1.39 christos (void)REQUIRE(MORE(), REG_EESCAPE);
906 1.39 christos cc = GETNEXT();
907 1.39 christos c = BACKSL | cc;
908 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
909 1.39 christos if (p->gnuext) {
910 1.39 christos handled = true;
911 1.39 christos switch (c) {
912 1.39 christos case BACKSL|'`':
913 1.39 christos EMIT(OBOS, 0);
914 1.39 christos break;
915 1.39 christos case BACKSL|'\'':
916 1.39 christos EMIT(OEOS, 0);
917 1.39 christos break;
918 1.39 christos case BACKSL|'B':
919 1.39 christos EMIT(ONWBND, 0);
920 1.39 christos break;
921 1.39 christos case BACKSL|'b':
922 1.39 christos EMIT(OWBND, 0);
923 1.39 christos break;
924 1.39 christos case BACKSL|'W':
925 1.39 christos case BACKSL|'w':
926 1.39 christos case BACKSL|'S':
927 1.39 christos case BACKSL|'s':
928 1.39 christos p_b_pseudoclass(p, cc);
929 1.39 christos break;
930 1.46 christos case BACKSL|'a':
931 1.46 christos ordinary(p, '\a');
932 1.46 christos break;
933 1.46 christos case BACKSL|'e':
934 1.46 christos ordinary(p, '\e');
935 1.46 christos break;
936 1.46 christos case BACKSL|'f':
937 1.46 christos ordinary(p, '\f');
938 1.46 christos break;
939 1.46 christos case BACKSL|'n':
940 1.46 christos ordinary(p, '\n');
941 1.46 christos break;
942 1.46 christos case BACKSL|'r':
943 1.46 christos ordinary(p, '\r');
944 1.46 christos break;
945 1.46 christos case BACKSL|'t':
946 1.46 christos ordinary(p, '\t');
947 1.46 christos break;
948 1.46 christos case BACKSL|'v':
949 1.46 christos ordinary(p, '\v');
950 1.46 christos break;
951 1.39 christos default:
952 1.39 christos handled = false;
953 1.39 christos }
954 1.39 christos }
955 1.39 christos #endif
956 1.1 jtc }
957 1.39 christos if (!handled) {
958 1.39 christos switch (c) {
959 1.39 christos case '.':
960 1.39 christos if (p->g->cflags®_NEWLINE)
961 1.39 christos nonnewline(p);
962 1.39 christos else
963 1.39 christos EMIT(OANY, 0);
964 1.39 christos break;
965 1.39 christos case '[':
966 1.39 christos p_bracket(p);
967 1.39 christos break;
968 1.39 christos case BACKSL|'<':
969 1.39 christos EMIT(OBOW, 0);
970 1.39 christos break;
971 1.39 christos case BACKSL|'>':
972 1.39 christos EMIT(OEOW, 0);
973 1.39 christos break;
974 1.39 christos case BACKSL|'{':
975 1.39 christos SETERROR(REG_BADRPT);
976 1.39 christos break;
977 1.39 christos case BACKSL|'(':
978 1.39 christos p->g->nsub++;
979 1.40 christos subno = (sopno)p->g->nsub;
980 1.39 christos if (subno < NPAREN)
981 1.39 christos p->pbegin[subno] = HERE();
982 1.39 christos EMIT(OLPAREN, subno);
983 1.39 christos /* the MORE here is an error heuristic */
984 1.39 christos if (MORE() && !SEETWO('\\', ')'))
985 1.39 christos p_re(p, '\\', ')');
986 1.39 christos if (subno < NPAREN) {
987 1.39 christos p->pend[subno] = HERE();
988 1.39 christos assert(p->pend[subno] != 0);
989 1.39 christos }
990 1.39 christos EMIT(ORPAREN, subno);
991 1.39 christos (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
992 1.39 christos break;
993 1.39 christos case BACKSL|')': /* should not get here -- must be user */
994 1.39 christos SETERROR(REG_EPAREN);
995 1.39 christos break;
996 1.39 christos case BACKSL|'1':
997 1.39 christos case BACKSL|'2':
998 1.39 christos case BACKSL|'3':
999 1.39 christos case BACKSL|'4':
1000 1.39 christos case BACKSL|'5':
1001 1.39 christos case BACKSL|'6':
1002 1.39 christos case BACKSL|'7':
1003 1.39 christos case BACKSL|'8':
1004 1.39 christos case BACKSL|'9':
1005 1.39 christos i = (c&~BACKSL) - '0';
1006 1.39 christos assert(i < NPAREN);
1007 1.39 christos if (p->pend[i] != 0) {
1008 1.39 christos assert(i <= p->g->nsub);
1009 1.39 christos EMIT(OBACK_, i);
1010 1.39 christos assert(p->pbegin[i] != 0);
1011 1.39 christos assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
1012 1.39 christos assert(OP(p->strip[p->pend[i]]) == ORPAREN);
1013 1.39 christos (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
1014 1.39 christos EMIT(O_BACK, i);
1015 1.39 christos } else
1016 1.39 christos SETERROR(REG_ESUBREG);
1017 1.39 christos p->g->backrefs = 1;
1018 1.39 christos break;
1019 1.39 christos case '*':
1020 1.39 christos /*
1021 1.39 christos * Ordinary if used as the first character beyond BOL anchor of
1022 1.39 christos * a (sub-)expression, counts as a bad repetition operator if it
1023 1.39 christos * appears otherwise.
1024 1.39 christos */
1025 1.39 christos (void)REQUIRE(bc->nchain == 0, REG_BADRPT);
1026 1.39 christos /* FALLTHROUGH */
1027 1.39 christos default:
1028 1.39 christos if (p->error != 0)
1029 1.39 christos return (false); /* Definitely not $... */
1030 1.39 christos p->next--;
1031 1.39 christos wc = WGETNEXT();
1032 1.39 christos if ((c & BACKSL) == 0 || may_escape(p, wc))
1033 1.39 christos ordinary(p, wc);
1034 1.39 christos else
1035 1.39 christos SETERROR(REG_EESCAPE);
1036 1.39 christos break;
1037 1.1 jtc }
1038 1.1 jtc }
1039 1.1 jtc
1040 1.1 jtc if (EAT('*')) { /* implemented as +? */
1041 1.4 jtc /* this case does not require the (y|) trick, noKLUDGE */
1042 1.1 jtc INSERT(OPLUS_, pos);
1043 1.1 jtc ASTERN(O_PLUS, pos);
1044 1.1 jtc INSERT(OQUEST_, pos);
1045 1.1 jtc ASTERN(O_QUEST, pos);
1046 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
1047 1.39 christos } else if (p->gnuext && EATTWO('\\', '?')) {
1048 1.39 christos INSERT(OQUEST_, pos);
1049 1.39 christos ASTERN(O_QUEST, pos);
1050 1.39 christos } else if (p->gnuext && EATTWO('\\', '+')) {
1051 1.39 christos INSERT(OPLUS_, pos);
1052 1.39 christos ASTERN(O_PLUS, pos);
1053 1.39 christos #endif
1054 1.1 jtc } else if (EATTWO('\\', '{')) {
1055 1.1 jtc count = p_count(p);
1056 1.1 jtc if (EAT(',')) {
1057 1.39 christos if (MORE() && isdigit((uch)PEEK())) {
1058 1.1 jtc count2 = p_count(p);
1059 1.39 christos (void)REQUIRE(count <= count2, REG_BADBR);
1060 1.1 jtc } else /* single number with comma */
1061 1.1 jtc count2 = INFINITY;
1062 1.1 jtc } else /* just a single number */
1063 1.1 jtc count2 = count;
1064 1.39 christos repeat(p, pos, count, count2);
1065 1.1 jtc if (!EATTWO('\\', '}')) { /* error heuristics */
1066 1.1 jtc while (MORE() && !SEETWO('\\', '}'))
1067 1.1 jtc NEXT();
1068 1.39 christos (void)REQUIRE(MORE(), REG_EBRACE);
1069 1.1 jtc SETERROR(REG_BADBR);
1070 1.1 jtc }
1071 1.39 christos } else if (c == '$') /* $ (but not \$) ends it */
1072 1.39 christos return (true);
1073 1.1 jtc
1074 1.39 christos return (false);
1075 1.1 jtc }
1076 1.1 jtc
1077 1.1 jtc /*
1078 1.1 jtc - p_count - parse a repetition count
1079 1.9 perry == static int p_count(struct parse *p);
1080 1.1 jtc */
1081 1.1 jtc static int /* the value */
1082 1.39 christos p_count(struct parse *p)
1083 1.1 jtc {
1084 1.9 perry int count = 0;
1085 1.9 perry int ndigits = 0;
1086 1.1 jtc
1087 1.39 christos while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
1088 1.1 jtc count = count*10 + (GETNEXT() - '0');
1089 1.1 jtc ndigits++;
1090 1.1 jtc }
1091 1.1 jtc
1092 1.39 christos (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
1093 1.1 jtc return(count);
1094 1.1 jtc }
1095 1.1 jtc
1096 1.1 jtc /*
1097 1.1 jtc - p_bracket - parse a bracketed character list
1098 1.9 perry == static void p_bracket(struct parse *p);
1099 1.1 jtc */
1100 1.1 jtc static void
1101 1.39 christos p_bracket(struct parse *p)
1102 1.1 jtc {
1103 1.14 lukem cset *cs;
1104 1.39 christos wint_t ch;
1105 1.14 lukem
1106 1.1 jtc /* Dept of Truly Sickening Special-Case Kludges */
1107 1.39 christos if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
1108 1.1 jtc EMIT(OBOW, 0);
1109 1.1 jtc NEXTn(6);
1110 1.1 jtc return;
1111 1.1 jtc }
1112 1.39 christos if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
1113 1.1 jtc EMIT(OEOW, 0);
1114 1.1 jtc NEXTn(6);
1115 1.1 jtc return;
1116 1.1 jtc }
1117 1.1 jtc
1118 1.39 christos if ((cs = allocset(p)) == NULL)
1119 1.39 christos return;
1120 1.39 christos
1121 1.39 christos if (p->g->cflags®_ICASE)
1122 1.39 christos cs->icase = 1;
1123 1.1 jtc if (EAT('^'))
1124 1.39 christos cs->invert = 1;
1125 1.1 jtc if (EAT(']'))
1126 1.39 christos CHadd(p, cs, ']');
1127 1.1 jtc else if (EAT('-'))
1128 1.39 christos CHadd(p, cs, '-');
1129 1.1 jtc while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
1130 1.1 jtc p_b_term(p, cs);
1131 1.1 jtc if (EAT('-'))
1132 1.39 christos CHadd(p, cs, '-');
1133 1.39 christos (void)MUSTEAT(']', REG_EBRACK);
1134 1.1 jtc
1135 1.1 jtc if (p->error != 0) /* don't mess things up further */
1136 1.1 jtc return;
1137 1.1 jtc
1138 1.39 christos if (cs->invert && p->g->cflags®_NEWLINE)
1139 1.39 christos cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
1140 1.1 jtc
1141 1.39 christos if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
1142 1.39 christos ordinary(p, ch);
1143 1.39 christos freeset(p, cs);
1144 1.39 christos } else
1145 1.40 christos EMIT(OANYOF, (size_t)(cs - p->g->sets));
1146 1.39 christos }
1147 1.1 jtc
1148 1.39 christos static int
1149 1.39 christos p_range_cmp(wchar_t c1, wchar_t c2)
1150 1.39 christos {
1151 1.39 christos #ifdef REGEX_LIBC_COLLATE
1152 1.39 christos return __wcollate_range_cmp(c1, c2);
1153 1.42 christos #elif defined(NLS)
1154 1.39 christos /* Copied from libc/collate __wcollate_range_cmp */
1155 1.39 christos wchar_t s1[2], s2[2];
1156 1.1 jtc
1157 1.39 christos s1[0] = c1;
1158 1.39 christos s1[1] = L'\0';
1159 1.39 christos s2[0] = c2;
1160 1.39 christos s2[1] = L'\0';
1161 1.42 christos return wcscoll(s1, s2);
1162 1.42 christos #else
1163 1.42 christos char s1[2], s2[2];
1164 1.42 christos
1165 1.42 christos s1[0] = (char)c1;
1166 1.42 christos s1[1] = '\0';
1167 1.42 christos s2[0] = (char)c2;
1168 1.42 christos s2[1] = '\0';
1169 1.42 christos return strcoll(s1, s2);
1170 1.39 christos #endif
1171 1.1 jtc }
1172 1.1 jtc
1173 1.1 jtc /*
1174 1.1 jtc - p_b_term - parse one term of a bracketed character list
1175 1.9 perry == static void p_b_term(struct parse *p, cset *cs);
1176 1.1 jtc */
1177 1.1 jtc static void
1178 1.39 christos p_b_term(struct parse *p, cset *cs)
1179 1.1 jtc {
1180 1.9 perry char c;
1181 1.39 christos wint_t start, finish;
1182 1.39 christos wint_t i;
1183 1.39 christos #ifdef REGEX_LIBC_COLLATE
1184 1.39 christos struct xlocale_collate *table =
1185 1.39 christos (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
1186 1.39 christos #endif
1187 1.1 jtc
1188 1.14 lukem _DIAGASSERT(p != NULL);
1189 1.14 lukem _DIAGASSERT(cs != NULL);
1190 1.14 lukem
1191 1.1 jtc /* classify what we've got */
1192 1.1 jtc switch ((MORE()) ? PEEK() : '\0') {
1193 1.1 jtc case '[':
1194 1.1 jtc c = (MORE2()) ? PEEK2() : '\0';
1195 1.1 jtc break;
1196 1.1 jtc case '-':
1197 1.1 jtc SETERROR(REG_ERANGE);
1198 1.1 jtc return; /* NOTE RETURN */
1199 1.1 jtc default:
1200 1.1 jtc c = '\0';
1201 1.1 jtc break;
1202 1.1 jtc }
1203 1.1 jtc
1204 1.1 jtc switch (c) {
1205 1.1 jtc case ':': /* character class */
1206 1.1 jtc NEXT2();
1207 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1208 1.1 jtc c = PEEK();
1209 1.39 christos (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
1210 1.1 jtc p_b_cclass(p, cs);
1211 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1212 1.39 christos (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
1213 1.1 jtc break;
1214 1.1 jtc case '=': /* equivalence class */
1215 1.1 jtc NEXT2();
1216 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1217 1.1 jtc c = PEEK();
1218 1.39 christos (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
1219 1.1 jtc p_b_eclass(p, cs);
1220 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1221 1.39 christos (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
1222 1.1 jtc break;
1223 1.1 jtc default: /* symbol, ordinary character, or range */
1224 1.1 jtc start = p_b_symbol(p);
1225 1.1 jtc if (SEE('-') && MORE2() && PEEK2() != ']') {
1226 1.1 jtc /* range */
1227 1.1 jtc NEXT();
1228 1.1 jtc if (EAT('-'))
1229 1.1 jtc finish = '-';
1230 1.1 jtc else
1231 1.1 jtc finish = p_b_symbol(p);
1232 1.1 jtc } else
1233 1.1 jtc finish = start;
1234 1.39 christos if (start == finish)
1235 1.39 christos CHadd(p, cs, start);
1236 1.39 christos else {
1237 1.39 christos #ifdef REGEX_LIBC_COLLATE
1238 1.39 christos if (table->__collate_load_error || MB_CUR_MAX > 1) {
1239 1.39 christos #else
1240 1.39 christos if (MB_CUR_MAX > 1) {
1241 1.39 christos #endif
1242 1.39 christos (void)REQUIRE(start <= finish, REG_ERANGE);
1243 1.39 christos CHaddrange(p, cs, start, finish);
1244 1.39 christos } else {
1245 1.39 christos (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE);
1246 1.39 christos for (i = 0; i <= UCHAR_MAX; i++) {
1247 1.39 christos if (p_range_cmp(start, i) <= 0 &&
1248 1.39 christos p_range_cmp(i, finish) <= 0 )
1249 1.39 christos CHadd(p, cs, i);
1250 1.39 christos }
1251 1.39 christos }
1252 1.39 christos }
1253 1.39 christos break;
1254 1.39 christos }
1255 1.39 christos }
1256 1.39 christos
1257 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
1258 1.39 christos /*
1259 1.39 christos - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S)
1260 1.39 christos == static int p_b_pseudoclass(struct parse *p, char c)
1261 1.39 christos */
1262 1.39 christos static int
1263 1.39 christos p_b_pseudoclass(struct parse *p, char c) {
1264 1.39 christos cset *cs;
1265 1.39 christos
1266 1.39 christos if ((cs = allocset(p)) == NULL)
1267 1.39 christos return(0);
1268 1.39 christos
1269 1.39 christos if (p->g->cflags®_ICASE)
1270 1.39 christos cs->icase = 1;
1271 1.39 christos
1272 1.39 christos switch (c) {
1273 1.39 christos case 'W':
1274 1.39 christos cs->invert = 1;
1275 1.39 christos /* FALLTHROUGH */
1276 1.39 christos case 'w':
1277 1.39 christos p_b_cclass_named(p, cs, "alnum");
1278 1.1 jtc break;
1279 1.39 christos case 'S':
1280 1.39 christos cs->invert = 1;
1281 1.39 christos /* FALLTHROUGH */
1282 1.39 christos case 's':
1283 1.39 christos p_b_cclass_named(p, cs, "space");
1284 1.39 christos break;
1285 1.39 christos default:
1286 1.39 christos return(0);
1287 1.1 jtc }
1288 1.39 christos
1289 1.40 christos EMIT(OANYOF, (size_t)(cs - p->g->sets));
1290 1.39 christos return(1);
1291 1.1 jtc }
1292 1.39 christos #endif
1293 1.1 jtc
1294 1.1 jtc /*
1295 1.1 jtc - p_b_cclass - parse a character-class name and deal with it
1296 1.9 perry == static void p_b_cclass(struct parse *p, cset *cs);
1297 1.1 jtc */
1298 1.1 jtc static void
1299 1.39 christos p_b_cclass(struct parse *p, cset *cs)
1300 1.1 jtc {
1301 1.39 christos const char *sp = p->next;
1302 1.9 perry size_t len;
1303 1.39 christos char clname[16];
1304 1.1 jtc
1305 1.39 christos while (MORE() && isalpha((uch)PEEK()))
1306 1.1 jtc NEXT();
1307 1.1 jtc len = p->next - sp;
1308 1.39 christos if (len >= sizeof(clname) - 1) {
1309 1.1 jtc SETERROR(REG_ECTYPE);
1310 1.1 jtc return;
1311 1.1 jtc }
1312 1.39 christos memcpy(clname, sp, len);
1313 1.39 christos clname[len] = '\0';
1314 1.39 christos
1315 1.39 christos p_b_cclass_named(p, cs, clname);
1316 1.39 christos }
1317 1.42 christos
1318 1.39 christos /*
1319 1.39 christos - p_b_cclass_named - deal with a named character class
1320 1.39 christos == static void p_b_cclass_named(struct parse *p, cset *cs, const char []);
1321 1.39 christos */
1322 1.39 christos static void
1323 1.39 christos p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) {
1324 1.39 christos wctype_t wct;
1325 1.1 jtc
1326 1.39 christos if ((wct = wctype(clname)) == 0) {
1327 1.39 christos SETERROR(REG_ECTYPE);
1328 1.39 christos return;
1329 1.39 christos }
1330 1.39 christos CHaddtype(p, cs, wct);
1331 1.1 jtc }
1332 1.1 jtc
1333 1.1 jtc /*
1334 1.1 jtc - p_b_eclass - parse an equivalence-class name and deal with it
1335 1.9 perry == static void p_b_eclass(struct parse *p, cset *cs);
1336 1.1 jtc *
1337 1.1 jtc * This implementation is incomplete. xxx
1338 1.1 jtc */
1339 1.1 jtc static void
1340 1.39 christos p_b_eclass(struct parse *p, cset *cs)
1341 1.1 jtc {
1342 1.39 christos wint_t c;
1343 1.1 jtc
1344 1.14 lukem _DIAGASSERT(p != NULL);
1345 1.14 lukem _DIAGASSERT(cs != NULL);
1346 1.14 lukem
1347 1.1 jtc c = p_b_coll_elem(p, '=');
1348 1.39 christos CHadd(p, cs, c);
1349 1.1 jtc }
1350 1.1 jtc
1351 1.1 jtc /*
1352 1.1 jtc - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1353 1.39 christos == static wint_t p_b_symbol(struct parse *p);
1354 1.1 jtc */
1355 1.39 christos static wint_t /* value of symbol */
1356 1.39 christos p_b_symbol(struct parse *p)
1357 1.1 jtc {
1358 1.39 christos wint_t value;
1359 1.1 jtc
1360 1.14 lukem _DIAGASSERT(p != NULL);
1361 1.14 lukem
1362 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1363 1.1 jtc if (!EATTWO('[', '.'))
1364 1.39 christos return(WGETNEXT());
1365 1.1 jtc
1366 1.1 jtc /* collating symbol */
1367 1.1 jtc value = p_b_coll_elem(p, '.');
1368 1.39 christos (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1369 1.1 jtc return(value);
1370 1.1 jtc }
1371 1.1 jtc
1372 1.1 jtc /*
1373 1.1 jtc - p_b_coll_elem - parse a collating-element name and look it up
1374 1.39 christos == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
1375 1.1 jtc */
1376 1.39 christos static wint_t /* value of collating element */
1377 1.39 christos p_b_coll_elem(struct parse *p,
1378 1.39 christos wint_t endc) /* name ended by endc,']' */
1379 1.39 christos {
1380 1.39 christos const char *sp = p->next;
1381 1.39 christos struct cname *cp;
1382 1.42 christos size_t len;
1383 1.1 jtc
1384 1.14 lukem _DIAGASSERT(p != NULL);
1385 1.14 lukem
1386 1.1 jtc while (MORE() && !SEETWO(endc, ']'))
1387 1.1 jtc NEXT();
1388 1.1 jtc if (!MORE()) {
1389 1.1 jtc SETERROR(REG_EBRACK);
1390 1.1 jtc return(0);
1391 1.1 jtc }
1392 1.1 jtc len = p->next - sp;
1393 1.1 jtc for (cp = cnames; cp->name != NULL; cp++)
1394 1.37 christos if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1395 1.1 jtc return(cp->code); /* known name */
1396 1.42 christos #ifdef NLS
1397 1.42 christos mbstate_t mbs;
1398 1.42 christos wchar_t wc;
1399 1.42 christos size_t clen;
1400 1.42 christos
1401 1.39 christos memset(&mbs, 0, sizeof(mbs));
1402 1.39 christos if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1403 1.39 christos return (wc); /* single character */
1404 1.39 christos else if (clen == (size_t)-1 || clen == (size_t)-2)
1405 1.39 christos SETERROR(REG_ILLSEQ);
1406 1.39 christos else
1407 1.39 christos SETERROR(REG_ECOLLATE); /* neither */
1408 1.1 jtc return(0);
1409 1.42 christos #else
1410 1.42 christos if (len == 1)
1411 1.42 christos return *sp; /* single character */
1412 1.42 christos SETERROR(REG_ECOLLATE); /* neither */
1413 1.42 christos return 0;
1414 1.42 christos #endif
1415 1.1 jtc }
1416 1.1 jtc
1417 1.1 jtc /*
1418 1.39 christos - may_escape - determine whether 'ch' is escape-able in the current context
1419 1.39 christos == static int may_escape(struct parse *p, const wint_t ch)
1420 1.39 christos */
1421 1.39 christos static bool
1422 1.39 christos may_escape(struct parse *p, const wint_t ch)
1423 1.39 christos {
1424 1.39 christos
1425 1.39 christos if ((p->pflags & PFLAG_LEGACY_ESC) != 0)
1426 1.39 christos return (true);
1427 1.39 christos if (isalpha(ch) || ch == '\'' || ch == '`')
1428 1.39 christos return (false);
1429 1.39 christos return (true);
1430 1.39 christos #ifdef NOTYET
1431 1.39 christos /*
1432 1.39 christos * Build a whitelist of characters that may be escaped to produce an
1433 1.39 christos * ordinary in the current context. This assumes that these have not
1434 1.39 christos * been otherwise interpreted as a special character. Escaping an
1435 1.39 christos * ordinary character yields undefined results according to
1436 1.39 christos * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
1437 1.39 christos * advantage of this and use escaped ordinary characters to provide
1438 1.39 christos * special meaning, e.g. \b, \B, \w, \W, \s, \S.
1439 1.39 christos */
1440 1.39 christos switch(ch) {
1441 1.39 christos case '|':
1442 1.39 christos case '+':
1443 1.39 christos case '?':
1444 1.39 christos /* The above characters may not be escaped in BREs */
1445 1.39 christos if (!(p->g->cflags®_EXTENDED))
1446 1.39 christos return (false);
1447 1.39 christos /* Fallthrough */
1448 1.39 christos case '(':
1449 1.39 christos case ')':
1450 1.39 christos case '{':
1451 1.39 christos case '}':
1452 1.39 christos case '.':
1453 1.39 christos case '[':
1454 1.39 christos case ']':
1455 1.39 christos case '\\':
1456 1.39 christos case '*':
1457 1.39 christos case '^':
1458 1.39 christos case '$':
1459 1.39 christos return (true);
1460 1.39 christos default:
1461 1.39 christos return (false);
1462 1.39 christos }
1463 1.39 christos #endif
1464 1.39 christos }
1465 1.39 christos
1466 1.39 christos /*
1467 1.1 jtc - othercase - return the case counterpart of an alphabetic
1468 1.39 christos == static wint_t othercase(wint_t ch);
1469 1.1 jtc */
1470 1.39 christos static wint_t /* if no counterpart, return ch */
1471 1.39 christos othercase(wint_t ch)
1472 1.39 christos {
1473 1.39 christos assert(iswalpha(ch));
1474 1.39 christos if (iswupper(ch))
1475 1.39 christos return(towlower(ch));
1476 1.39 christos else if (iswlower(ch))
1477 1.39 christos return(towupper(ch));
1478 1.1 jtc else /* peculiar, but could happen */
1479 1.1 jtc return(ch);
1480 1.1 jtc }
1481 1.1 jtc
1482 1.1 jtc /*
1483 1.1 jtc - bothcases - emit a dualcase version of a two-case character
1484 1.39 christos == static void bothcases(struct parse *p, wint_t ch);
1485 1.1 jtc *
1486 1.1 jtc * Boy, is this implementation ever a kludge...
1487 1.1 jtc */
1488 1.1 jtc static void
1489 1.39 christos bothcases(struct parse *p, wint_t ch)
1490 1.1 jtc {
1491 1.39 christos const char *oldnext = p->next;
1492 1.39 christos const char *oldend = p->end;
1493 1.39 christos char bracket[3 + MB_LEN_MAX];
1494 1.39 christos size_t n;
1495 1.1 jtc
1496 1.14 lukem _DIAGASSERT(p != NULL);
1497 1.14 lukem
1498 1.1 jtc assert(othercase(ch) != ch); /* p_bracket() would recurse */
1499 1.1 jtc p->next = bracket;
1500 1.42 christos #ifdef NLS
1501 1.42 christos mbstate_t mbs;
1502 1.39 christos memset(&mbs, 0, sizeof(mbs));
1503 1.39 christos n = wcrtomb(bracket, ch, &mbs);
1504 1.39 christos assert(n != (size_t)-1);
1505 1.42 christos #else
1506 1.42 christos n = 0;
1507 1.42 christos bracket[n++] = ch;
1508 1.42 christos #endif
1509 1.39 christos bracket[n] = ']';
1510 1.39 christos bracket[n + 1] = '\0';
1511 1.39 christos p->end = bracket+n+1;
1512 1.1 jtc p_bracket(p);
1513 1.39 christos assert(p->next == p->end);
1514 1.1 jtc p->next = oldnext;
1515 1.1 jtc p->end = oldend;
1516 1.1 jtc }
1517 1.1 jtc
1518 1.1 jtc /*
1519 1.1 jtc - ordinary - emit an ordinary character
1520 1.39 christos == static void ordinary(struct parse *p, wint_t ch);
1521 1.1 jtc */
1522 1.1 jtc static void
1523 1.39 christos ordinary(struct parse *p, wint_t ch)
1524 1.1 jtc {
1525 1.39 christos cset *cs;
1526 1.1 jtc
1527 1.14 lukem _DIAGASSERT(p != NULL);
1528 1.14 lukem
1529 1.39 christos if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1530 1.39 christos bothcases(p, ch);
1531 1.40 christos else if ((wint_t)(ch & OPDMASK) == ch)
1532 1.40 christos EMIT(OCHAR, (size_t)ch);
1533 1.1 jtc else {
1534 1.39 christos /*
1535 1.39 christos * Kludge: character is too big to fit into an OCHAR operand.
1536 1.39 christos * Emit a singleton set.
1537 1.39 christos */
1538 1.39 christos if ((cs = allocset(p)) == NULL)
1539 1.39 christos return;
1540 1.39 christos CHadd(p, cs, ch);
1541 1.40 christos EMIT(OANYOF, (size_t)(cs - p->g->sets));
1542 1.1 jtc }
1543 1.1 jtc }
1544 1.1 jtc
1545 1.1 jtc /*
1546 1.1 jtc - nonnewline - emit REG_NEWLINE version of OANY
1547 1.9 perry == static void nonnewline(struct parse *p);
1548 1.1 jtc *
1549 1.1 jtc * Boy, is this implementation ever a kludge...
1550 1.1 jtc */
1551 1.1 jtc static void
1552 1.39 christos nonnewline(struct parse *p)
1553 1.1 jtc {
1554 1.39 christos const char *oldnext = p->next;
1555 1.39 christos const char *oldend = p->end;
1556 1.1 jtc char bracket[4];
1557 1.1 jtc
1558 1.14 lukem _DIAGASSERT(p != NULL);
1559 1.14 lukem
1560 1.1 jtc p->next = bracket;
1561 1.1 jtc p->end = bracket+3;
1562 1.1 jtc bracket[0] = '^';
1563 1.1 jtc bracket[1] = '\n';
1564 1.1 jtc bracket[2] = ']';
1565 1.1 jtc bracket[3] = '\0';
1566 1.1 jtc p_bracket(p);
1567 1.1 jtc assert(p->next == bracket+3);
1568 1.1 jtc p->next = oldnext;
1569 1.1 jtc p->end = oldend;
1570 1.1 jtc }
1571 1.1 jtc
1572 1.1 jtc /*
1573 1.1 jtc - repeat - generate code for a bounded repetition, recursively if needed
1574 1.39 christos == static void repeat(struct parse *p, sopno start, int from, int to);
1575 1.1 jtc */
1576 1.1 jtc static void
1577 1.39 christos repeat(struct parse *p,
1578 1.39 christos sopno start, /* operand from here to end of strip */
1579 1.39 christos int from, /* repeated from this number */
1580 1.39 christos int to) /* to this number of times (maybe INFINITY) */
1581 1.1 jtc {
1582 1.39 christos sopno finish = HERE();
1583 1.1 jtc # define N 2
1584 1.1 jtc # define INF 3
1585 1.1 jtc # define REP(f, t) ((f)*8 + (t))
1586 1.1 jtc # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1587 1.9 perry sopno copy;
1588 1.1 jtc
1589 1.14 lukem _DIAGASSERT(p != NULL);
1590 1.14 lukem
1591 1.39 christos if (p->error != 0) /* head off possible runaway recursion */
1592 1.30 christos return;
1593 1.30 christos
1594 1.1 jtc assert(from <= to);
1595 1.1 jtc
1596 1.1 jtc switch (REP(MAP(from), MAP(to))) {
1597 1.1 jtc case REP(0, 0): /* must be user doing this */
1598 1.1 jtc DROP(finish-start); /* drop the operand */
1599 1.1 jtc break;
1600 1.1 jtc case REP(0, 1): /* as x{1,1}? */
1601 1.1 jtc case REP(0, N): /* as x{1,n}? */
1602 1.1 jtc case REP(0, INF): /* as x{1,}? */
1603 1.4 jtc /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1604 1.4 jtc INSERT(OCH_, start); /* offset is wrong... */
1605 1.39 christos repeat(p, start+1, 1, to);
1606 1.4 jtc ASTERN(OOR1, start);
1607 1.1 jtc AHEAD(start); /* ... fix it */
1608 1.4 jtc EMIT(OOR2, 0);
1609 1.4 jtc AHEAD(THERE());
1610 1.4 jtc ASTERN(O_CH, THERETHERE());
1611 1.1 jtc break;
1612 1.1 jtc case REP(1, 1): /* trivial case */
1613 1.1 jtc /* done */
1614 1.1 jtc break;
1615 1.1 jtc case REP(1, N): /* as x?x{1,n-1} */
1616 1.4 jtc /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1617 1.4 jtc INSERT(OCH_, start);
1618 1.4 jtc ASTERN(OOR1, start);
1619 1.4 jtc AHEAD(start);
1620 1.4 jtc EMIT(OOR2, 0); /* offset very wrong... */
1621 1.4 jtc AHEAD(THERE()); /* ...so fix it */
1622 1.4 jtc ASTERN(O_CH, THERETHERE());
1623 1.1 jtc copy = dupl(p, start+1, finish+1);
1624 1.4 jtc assert(copy == finish+4);
1625 1.39 christos repeat(p, copy, 1, to-1);
1626 1.1 jtc break;
1627 1.1 jtc case REP(1, INF): /* as x+ */
1628 1.1 jtc INSERT(OPLUS_, start);
1629 1.1 jtc ASTERN(O_PLUS, start);
1630 1.1 jtc break;
1631 1.1 jtc case REP(N, N): /* as xx{m-1,n-1} */
1632 1.1 jtc copy = dupl(p, start, finish);
1633 1.39 christos repeat(p, copy, from-1, to-1);
1634 1.1 jtc break;
1635 1.1 jtc case REP(N, INF): /* as xx{n-1,INF} */
1636 1.1 jtc copy = dupl(p, start, finish);
1637 1.39 christos repeat(p, copy, from-1, to);
1638 1.1 jtc break;
1639 1.1 jtc default: /* "can't happen" */
1640 1.1 jtc SETERROR(REG_ASSERT); /* just in case */
1641 1.1 jtc break;
1642 1.1 jtc }
1643 1.1 jtc }
1644 1.1 jtc
1645 1.1 jtc /*
1646 1.39 christos - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1647 1.39 christos - character from the parse struct, signals a REG_ILLSEQ error if the
1648 1.39 christos - character can't be converted. Returns the number of bytes consumed.
1649 1.39 christos */
1650 1.39 christos static wint_t
1651 1.39 christos wgetnext(struct parse *p)
1652 1.39 christos {
1653 1.42 christos #ifdef NLS
1654 1.39 christos mbstate_t mbs;
1655 1.39 christos wchar_t wc;
1656 1.39 christos size_t n;
1657 1.39 christos
1658 1.39 christos memset(&mbs, 0, sizeof(mbs));
1659 1.40 christos n = mbrtowc(&wc, p->next, (size_t)(p->end - p->next), &mbs);
1660 1.39 christos if (n == (size_t)-1 || n == (size_t)-2) {
1661 1.39 christos SETERROR(REG_ILLSEQ);
1662 1.39 christos return (0);
1663 1.39 christos }
1664 1.39 christos if (n == 0)
1665 1.39 christos n = 1;
1666 1.39 christos p->next += n;
1667 1.42 christos return wc;
1668 1.42 christos #else
1669 1.42 christos return *p->next++;
1670 1.42 christos #endif
1671 1.39 christos }
1672 1.39 christos
1673 1.39 christos /*
1674 1.1 jtc - seterr - set an error condition
1675 1.9 perry == static int seterr(struct parse *p, int e);
1676 1.1 jtc */
1677 1.1 jtc static int /* useless but makes type checking happy */
1678 1.39 christos seterr(struct parse *p, int e)
1679 1.1 jtc {
1680 1.14 lukem
1681 1.14 lukem _DIAGASSERT(p != NULL);
1682 1.14 lukem
1683 1.1 jtc if (p->error == 0) /* keep earliest error condition */
1684 1.1 jtc p->error = e;
1685 1.1 jtc p->next = nuls; /* try to bring things to a halt */
1686 1.1 jtc p->end = nuls;
1687 1.1 jtc return(0); /* make the return value well-defined */
1688 1.1 jtc }
1689 1.1 jtc
1690 1.1 jtc /*
1691 1.1 jtc - allocset - allocate a set of characters for []
1692 1.9 perry == static cset *allocset(struct parse *p);
1693 1.1 jtc */
1694 1.1 jtc static cset *
1695 1.39 christos allocset(struct parse *p)
1696 1.1 jtc {
1697 1.39 christos cset *cs, *ncs;
1698 1.1 jtc
1699 1.14 lukem _DIAGASSERT(p != NULL);
1700 1.14 lukem
1701 1.39 christos ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs));
1702 1.39 christos if (ncs == NULL) {
1703 1.39 christos SETERROR(REG_ESPACE);
1704 1.39 christos return (NULL);
1705 1.1 jtc }
1706 1.39 christos p->g->sets = ncs;
1707 1.39 christos cs = &p->g->sets[p->g->ncsets++];
1708 1.39 christos memset(cs, 0, sizeof(*cs));
1709 1.1 jtc
1710 1.1 jtc return(cs);
1711 1.1 jtc }
1712 1.1 jtc
1713 1.1 jtc /*
1714 1.1 jtc - freeset - free a now-unused set
1715 1.9 perry == static void freeset(struct parse *p, cset *cs);
1716 1.1 jtc */
1717 1.1 jtc static void
1718 1.39 christos freeset(struct parse *p, cset *cs)
1719 1.1 jtc {
1720 1.14 lukem cset *top;
1721 1.14 lukem
1722 1.14 lukem _DIAGASSERT(p != NULL);
1723 1.14 lukem _DIAGASSERT(cs != NULL);
1724 1.14 lukem
1725 1.14 lukem top = &p->g->sets[p->g->ncsets];
1726 1.1 jtc
1727 1.39 christos free(cs->wides);
1728 1.39 christos free(cs->ranges);
1729 1.39 christos free(cs->types);
1730 1.39 christos memset(cs, 0, sizeof(*cs));
1731 1.1 jtc if (cs == top-1) /* recover only the easy case */
1732 1.1 jtc p->g->ncsets--;
1733 1.1 jtc }
1734 1.1 jtc
1735 1.1 jtc /*
1736 1.39 christos - singleton - Determine whether a set contains only one character,
1737 1.39 christos - returning it if so, otherwise returning OUT.
1738 1.1 jtc */
1739 1.39 christos static wint_t
1740 1.39 christos singleton(cset *cs)
1741 1.1 jtc {
1742 1.39 christos wint_t i, s, n;
1743 1.1 jtc
1744 1.39 christos for (i = n = 0; i < NC; i++)
1745 1.39 christos if (CHIN(cs, i)) {
1746 1.1 jtc n++;
1747 1.39 christos s = i;
1748 1.39 christos }
1749 1.39 christos if (n == 1)
1750 1.39 christos return (s);
1751 1.39 christos if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1752 1.39 christos cs->icase == 0)
1753 1.39 christos return (cs->wides[0]);
1754 1.39 christos /* Don't bother handling the other cases. */
1755 1.39 christos return (OUT);
1756 1.1 jtc }
1757 1.1 jtc
1758 1.1 jtc /*
1759 1.39 christos - CHadd - add character to character set.
1760 1.1 jtc */
1761 1.1 jtc static void
1762 1.39 christos CHadd(struct parse *p, cset *cs, wint_t ch)
1763 1.1 jtc {
1764 1.39 christos wint_t nch, *newwides;
1765 1.14 lukem
1766 1.14 lukem _DIAGASSERT(p != NULL);
1767 1.14 lukem _DIAGASSERT(cs != NULL);
1768 1.14 lukem
1769 1.39 christos assert(ch >= 0);
1770 1.39 christos if (ch < NC)
1771 1.40 christos cs->bmp[(unsigned)ch >> 3] |= 1 << (ch & 7);
1772 1.39 christos else {
1773 1.39 christos newwides = reallocarray(cs->wides, cs->nwides + 1,
1774 1.39 christos sizeof(*cs->wides));
1775 1.39 christos if (newwides == NULL) {
1776 1.39 christos SETERROR(REG_ESPACE);
1777 1.39 christos return;
1778 1.39 christos }
1779 1.39 christos cs->wides = newwides;
1780 1.39 christos cs->wides[cs->nwides++] = ch;
1781 1.39 christos }
1782 1.39 christos if (cs->icase) {
1783 1.39 christos if ((nch = towlower(ch)) < NC)
1784 1.40 christos cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1785 1.39 christos if ((nch = towupper(ch)) < NC)
1786 1.40 christos cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1787 1.1 jtc }
1788 1.1 jtc }
1789 1.1 jtc
1790 1.1 jtc /*
1791 1.39 christos - CHaddrange - add all characters in the range [min,max] to a character set.
1792 1.1 jtc */
1793 1.1 jtc static void
1794 1.39 christos CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1795 1.1 jtc {
1796 1.39 christos crange *newranges;
1797 1.14 lukem
1798 1.39 christos _DIAGASSERT(p != NULL);
1799 1.14 lukem _DIAGASSERT(cs != NULL);
1800 1.14 lukem
1801 1.39 christos for (; min < NC && min <= max; min++)
1802 1.39 christos CHadd(p, cs, min);
1803 1.39 christos if (min >= max)
1804 1.39 christos return;
1805 1.39 christos newranges = reallocarray(cs->ranges, cs->nranges + 1,
1806 1.39 christos sizeof(*cs->ranges));
1807 1.39 christos if (newranges == NULL) {
1808 1.39 christos SETERROR(REG_ESPACE);
1809 1.1 jtc return;
1810 1.1 jtc }
1811 1.39 christos cs->ranges = newranges;
1812 1.39 christos cs->ranges[cs->nranges].min = min;
1813 1.39 christos cs->ranges[cs->nranges].max = max;
1814 1.39 christos cs->nranges++;
1815 1.1 jtc }
1816 1.1 jtc
1817 1.1 jtc /*
1818 1.39 christos - CHaddtype - add all characters of a certain type to a character set.
1819 1.1 jtc */
1820 1.1 jtc static void
1821 1.39 christos CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1822 1.1 jtc {
1823 1.39 christos wint_t i;
1824 1.39 christos wctype_t *newtypes;
1825 1.14 lukem
1826 1.14 lukem _DIAGASSERT(p != NULL);
1827 1.14 lukem _DIAGASSERT(cs != NULL);
1828 1.14 lukem
1829 1.39 christos for (i = 0; i < NC; i++)
1830 1.39 christos if (iswctype(i, wct))
1831 1.39 christos CHadd(p, cs, i);
1832 1.39 christos newtypes = reallocarray(cs->types, cs->ntypes + 1,
1833 1.39 christos sizeof(*cs->types));
1834 1.39 christos if (newtypes == NULL) {
1835 1.39 christos SETERROR(REG_ESPACE);
1836 1.1 jtc return;
1837 1.39 christos }
1838 1.39 christos cs->types = newtypes;
1839 1.39 christos cs->types[cs->ntypes++] = wct;
1840 1.1 jtc }
1841 1.1 jtc
1842 1.1 jtc /*
1843 1.1 jtc - dupl - emit a duplicate of a bunch of sops
1844 1.9 perry == static sopno dupl(struct parse *p, sopno start, sopno finish);
1845 1.1 jtc */
1846 1.1 jtc static sopno /* start of duplicate */
1847 1.39 christos dupl(struct parse *p,
1848 1.39 christos sopno start, /* from here */
1849 1.39 christos sopno finish) /* to this less one */
1850 1.1 jtc {
1851 1.39 christos sopno ret = HERE();
1852 1.9 perry sopno len = finish - start;
1853 1.1 jtc
1854 1.14 lukem _DIAGASSERT(p != NULL);
1855 1.14 lukem
1856 1.1 jtc assert(finish >= start);
1857 1.1 jtc if (len == 0)
1858 1.1 jtc return(ret);
1859 1.39 christos if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1860 1.39 christos return(ret);
1861 1.40 christos (void) memcpy(p->strip + p->slen,
1862 1.40 christos p->strip + start, len * sizeof(*p->strip));
1863 1.1 jtc p->slen += len;
1864 1.1 jtc return(ret);
1865 1.1 jtc }
1866 1.1 jtc
1867 1.1 jtc /*
1868 1.1 jtc - doemit - emit a strip operator
1869 1.9 perry == static void doemit(struct parse *p, sop op, size_t opnd);
1870 1.1 jtc *
1871 1.1 jtc * It might seem better to implement this as a macro with a function as
1872 1.1 jtc * hard-case backup, but it's just too big and messy unless there are
1873 1.1 jtc * some changes to the data structures. Maybe later.
1874 1.1 jtc */
1875 1.1 jtc static void
1876 1.39 christos doemit(struct parse *p, sop op, size_t opnd)
1877 1.1 jtc {
1878 1.1 jtc /* avoid making error situations worse */
1879 1.1 jtc if (p->error != 0)
1880 1.1 jtc return;
1881 1.1 jtc
1882 1.39 christos _DIAGASSERT(p != NULL);
1883 1.39 christos
1884 1.1 jtc /* deal with oversize operands ("can't happen", more or less) */
1885 1.1 jtc assert(opnd < 1<<OPSHIFT);
1886 1.1 jtc
1887 1.1 jtc /* deal with undersized strip */
1888 1.1 jtc if (p->slen >= p->ssize)
1889 1.30 christos if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */
1890 1.30 christos return;
1891 1.1 jtc
1892 1.1 jtc /* finally, it's all reduced to the easy case */
1893 1.40 christos p->strip[p->slen++] = (sopno)SOP(op, opnd);
1894 1.1 jtc }
1895 1.1 jtc
1896 1.1 jtc /*
1897 1.1 jtc - doinsert - insert a sop into the strip
1898 1.9 perry == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1899 1.1 jtc */
1900 1.1 jtc static void
1901 1.39 christos doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1902 1.1 jtc {
1903 1.9 perry sopno sn;
1904 1.9 perry sop s;
1905 1.9 perry int i;
1906 1.1 jtc
1907 1.14 lukem _DIAGASSERT(p != NULL);
1908 1.14 lukem
1909 1.1 jtc /* avoid making error situations worse */
1910 1.1 jtc if (p->error != 0)
1911 1.1 jtc return;
1912 1.1 jtc
1913 1.1 jtc sn = HERE();
1914 1.1 jtc EMIT(op, opnd); /* do checks, ensure space */
1915 1.1 jtc assert(HERE() == sn+1);
1916 1.1 jtc s = p->strip[sn];
1917 1.1 jtc
1918 1.1 jtc /* adjust paren pointers */
1919 1.1 jtc assert(pos > 0);
1920 1.1 jtc for (i = 1; i < NPAREN; i++) {
1921 1.1 jtc if (p->pbegin[i] >= pos) {
1922 1.1 jtc p->pbegin[i]++;
1923 1.1 jtc }
1924 1.1 jtc if (p->pend[i] >= pos) {
1925 1.1 jtc p->pend[i]++;
1926 1.1 jtc }
1927 1.1 jtc }
1928 1.1 jtc
1929 1.40 christos memmove(&p->strip[pos+1], &p->strip[pos],
1930 1.40 christos (HERE()-pos-1)*sizeof(*p->strip));
1931 1.1 jtc p->strip[pos] = s;
1932 1.1 jtc }
1933 1.1 jtc
1934 1.1 jtc /*
1935 1.1 jtc - dofwd - complete a forward reference
1936 1.9 perry == static void dofwd(struct parse *p, sopno pos, sop value);
1937 1.1 jtc */
1938 1.1 jtc static void
1939 1.39 christos dofwd(struct parse *p, sopno pos, sop value)
1940 1.1 jtc {
1941 1.14 lukem
1942 1.14 lukem _DIAGASSERT(p != NULL);
1943 1.14 lukem
1944 1.1 jtc /* avoid making error situations worse */
1945 1.1 jtc if (p->error != 0)
1946 1.1 jtc return;
1947 1.1 jtc
1948 1.1 jtc assert(value < 1<<OPSHIFT);
1949 1.39 christos p->strip[pos] = OP(p->strip[pos]) | value;
1950 1.1 jtc }
1951 1.1 jtc
1952 1.1 jtc /*
1953 1.1 jtc - enlarge - enlarge the strip
1954 1.39 christos == static int enlarge(struct parse *p, sopno size);
1955 1.1 jtc */
1956 1.30 christos static int
1957 1.35 joerg enlarge(struct parse *p, sopno size)
1958 1.1 jtc {
1959 1.39 christos sop *sp;
1960 1.39 christos
1961 1.14 lukem _DIAGASSERT(p != NULL);
1962 1.14 lukem
1963 1.1 jtc if (p->ssize >= size)
1964 1.30 christos return 1;
1965 1.1 jtc
1966 1.40 christos sp = reallocarray(p->strip, size, sizeof(*p->strip));
1967 1.39 christos if (sp == NULL) {
1968 1.1 jtc SETERROR(REG_ESPACE);
1969 1.30 christos return 0;
1970 1.1 jtc }
1971 1.39 christos p->strip = sp;
1972 1.35 joerg p->ssize = size;
1973 1.30 christos return 1;
1974 1.1 jtc }
1975 1.1 jtc
1976 1.1 jtc /*
1977 1.1 jtc - stripsnug - compact the strip
1978 1.9 perry == static void stripsnug(struct parse *p, struct re_guts *g);
1979 1.1 jtc */
1980 1.1 jtc static void
1981 1.39 christos stripsnug(struct parse *p, struct re_guts *g)
1982 1.1 jtc {
1983 1.14 lukem
1984 1.14 lukem _DIAGASSERT(p != NULL);
1985 1.14 lukem _DIAGASSERT(g != NULL);
1986 1.14 lukem
1987 1.1 jtc g->nstates = p->slen;
1988 1.40 christos g->strip = reallocarray(p->strip, p->slen, sizeof(*p->strip));
1989 1.39 christos if (g->strip == NULL) {
1990 1.39 christos SETERROR(REG_ESPACE);
1991 1.39 christos g->strip = p->strip;
1992 1.39 christos }
1993 1.1 jtc }
1994 1.1 jtc
1995 1.1 jtc /*
1996 1.1 jtc - findmust - fill in must and mlen with longest mandatory literal string
1997 1.9 perry == static void findmust(struct parse *p, struct re_guts *g);
1998 1.1 jtc *
1999 1.1 jtc * This algorithm could do fancy things like analyzing the operands of |
2000 1.1 jtc * for common subsequences. Someday. This code is simple and finds most
2001 1.1 jtc * of the interesting cases.
2002 1.1 jtc *
2003 1.1 jtc * Note that must and mlen got initialized during setup.
2004 1.1 jtc */
2005 1.1 jtc static void
2006 1.39 christos findmust(struct parse *p, struct re_guts *g)
2007 1.1 jtc {
2008 1.9 perry sop *scan;
2009 1.7 christos sop *start = NULL;
2010 1.9 perry sop *newstart = NULL;
2011 1.9 perry sopno newlen;
2012 1.9 perry sop s;
2013 1.9 perry char *cp;
2014 1.39 christos int offset;
2015 1.39 christos mbstate_t mbs;
2016 1.1 jtc
2017 1.14 lukem _DIAGASSERT(p != NULL);
2018 1.14 lukem _DIAGASSERT(g != NULL);
2019 1.14 lukem
2020 1.1 jtc /* avoid making error situations worse */
2021 1.1 jtc if (p->error != 0)
2022 1.1 jtc return;
2023 1.1 jtc
2024 1.39 christos #ifdef notyet
2025 1.39 christos /*
2026 1.39 christos * It's not generally safe to do a ``char'' substring search on
2027 1.39 christos * multibyte character strings, but it's safe for at least
2028 1.39 christos * UTF-8 (see RFC 3629).
2029 1.39 christos */
2030 1.39 christos if (MB_CUR_MAX > 1 &&
2031 1.39 christos strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
2032 1.39 christos return;
2033 1.39 christos #endif
2034 1.39 christos
2035 1.1 jtc /* find the longest OCHAR sequence in strip */
2036 1.1 jtc newlen = 0;
2037 1.39 christos offset = 0;
2038 1.39 christos g->moffset = 0;
2039 1.1 jtc scan = g->strip + 1;
2040 1.1 jtc do {
2041 1.1 jtc s = *scan++;
2042 1.1 jtc switch (OP(s)) {
2043 1.1 jtc case OCHAR: /* sequence member */
2044 1.39 christos if (newlen == 0) { /* new sequence */
2045 1.39 christos memset(&mbs, 0, sizeof(mbs));
2046 1.1 jtc newstart = scan - 1;
2047 1.39 christos }
2048 1.42 christos #ifdef NLS
2049 1.42 christos char buf[MB_LEN_MAX];
2050 1.42 christos size_t clen = wcrtomb(buf, (int)OPND(s), &mbs);
2051 1.39 christos if (clen == (size_t)-1)
2052 1.39 christos goto toohard;
2053 1.40 christos newlen += (sopno)clen;
2054 1.42 christos #else
2055 1.42 christos newlen++;
2056 1.42 christos #endif
2057 1.1 jtc break;
2058 1.1 jtc case OPLUS_: /* things that don't break one */
2059 1.1 jtc case OLPAREN:
2060 1.1 jtc case ORPAREN:
2061 1.1 jtc break;
2062 1.1 jtc case OQUEST_: /* things that must be skipped */
2063 1.1 jtc case OCH_:
2064 1.39 christos offset = altoffset(scan, offset);
2065 1.1 jtc scan--;
2066 1.1 jtc do {
2067 1.1 jtc scan += OPND(s);
2068 1.1 jtc s = *scan;
2069 1.1 jtc /* assert() interferes w debug printouts */
2070 1.40 christos if (OP(s) != O_QUEST &&
2071 1.40 christos OP(s) != O_CH && OP(s) != OOR2) {
2072 1.1 jtc g->iflags |= BAD;
2073 1.1 jtc return;
2074 1.1 jtc }
2075 1.40 christos } while (OP(s) != O_QUEST && OP(s) != O_CH);
2076 1.11 christos /* FALLTHROUGH */
2077 1.39 christos case OBOW: /* things that break a sequence */
2078 1.39 christos case OEOW:
2079 1.39 christos case OBOL:
2080 1.39 christos case OEOL:
2081 1.39 christos case OBOS:
2082 1.39 christos case OEOS:
2083 1.39 christos case OWBND:
2084 1.39 christos case ONWBND:
2085 1.39 christos case O_QUEST:
2086 1.39 christos case O_CH:
2087 1.39 christos case OEND:
2088 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */
2089 1.1 jtc start = newstart;
2090 1.1 jtc g->mlen = newlen;
2091 1.39 christos if (offset > -1) {
2092 1.39 christos g->moffset += offset;
2093 1.39 christos offset = newlen;
2094 1.39 christos } else
2095 1.39 christos g->moffset = offset;
2096 1.39 christos } else {
2097 1.39 christos if (offset > -1)
2098 1.39 christos offset += newlen;
2099 1.1 jtc }
2100 1.1 jtc newlen = 0;
2101 1.1 jtc break;
2102 1.39 christos case OANY:
2103 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */
2104 1.39 christos start = newstart;
2105 1.39 christos g->mlen = newlen;
2106 1.39 christos if (offset > -1) {
2107 1.39 christos g->moffset += offset;
2108 1.39 christos offset = newlen;
2109 1.39 christos } else
2110 1.39 christos g->moffset = offset;
2111 1.39 christos } else {
2112 1.39 christos if (offset > -1)
2113 1.39 christos offset += newlen;
2114 1.39 christos }
2115 1.39 christos if (offset > -1)
2116 1.39 christos offset++;
2117 1.39 christos newlen = 0;
2118 1.39 christos break;
2119 1.39 christos case OANYOF: /* may or may not invalidate offset */
2120 1.39 christos /* First, everything as OANY */
2121 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */
2122 1.39 christos start = newstart;
2123 1.39 christos g->mlen = newlen;
2124 1.39 christos if (offset > -1) {
2125 1.39 christos g->moffset += offset;
2126 1.39 christos offset = newlen;
2127 1.39 christos } else
2128 1.39 christos g->moffset = offset;
2129 1.39 christos } else {
2130 1.39 christos if (offset > -1)
2131 1.39 christos offset += newlen;
2132 1.39 christos }
2133 1.39 christos if (offset > -1)
2134 1.39 christos offset++;
2135 1.39 christos newlen = 0;
2136 1.39 christos break;
2137 1.42 christos #ifdef NLS
2138 1.40 christos toohard:/*FALLTHROUGH*/
2139 1.42 christos #endif
2140 1.39 christos default:
2141 1.39 christos /* Anything here makes it impossible or too hard
2142 1.39 christos * to calculate the offset -- so we give up;
2143 1.39 christos * save the last known good offset, in case the
2144 1.39 christos * must sequence doesn't occur later.
2145 1.39 christos */
2146 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */
2147 1.39 christos start = newstart;
2148 1.39 christos g->mlen = newlen;
2149 1.39 christos if (offset > -1)
2150 1.39 christos g->moffset += offset;
2151 1.39 christos else
2152 1.39 christos g->moffset = offset;
2153 1.39 christos }
2154 1.39 christos offset = -1;
2155 1.39 christos newlen = 0;
2156 1.39 christos break;
2157 1.1 jtc }
2158 1.1 jtc } while (OP(s) != OEND);
2159 1.1 jtc
2160 1.39 christos if (g->mlen == 0) { /* there isn't one */
2161 1.39 christos g->moffset = -1;
2162 1.1 jtc return;
2163 1.39 christos }
2164 1.1 jtc
2165 1.1 jtc /* turn it into a character string */
2166 1.1 jtc g->must = malloc((size_t)g->mlen + 1);
2167 1.1 jtc if (g->must == NULL) { /* argh; just forget it */
2168 1.1 jtc g->mlen = 0;
2169 1.39 christos g->moffset = -1;
2170 1.1 jtc return;
2171 1.1 jtc }
2172 1.1 jtc cp = g->must;
2173 1.1 jtc scan = start;
2174 1.39 christos memset(&mbs, 0, sizeof(mbs));
2175 1.39 christos while (cp < g->must + g->mlen) {
2176 1.1 jtc while (OP(s = *scan++) != OCHAR)
2177 1.1 jtc continue;
2178 1.42 christos #ifdef NLS
2179 1.43 christos size_t clen = wcrtomb(cp, (int)OPND(s), &mbs);
2180 1.39 christos assert(clen != (size_t)-1);
2181 1.39 christos cp += clen;
2182 1.42 christos #else
2183 1.42 christos *cp++ = OPND(s);
2184 1.42 christos #endif
2185 1.1 jtc }
2186 1.2 jtc assert(cp == g->must + g->mlen);
2187 1.1 jtc *cp++ = '\0'; /* just on general principles */
2188 1.1 jtc }
2189 1.1 jtc
2190 1.1 jtc /*
2191 1.39 christos - altoffset - choose biggest offset among multiple choices
2192 1.39 christos == static int altoffset(sop *scan, int offset);
2193 1.39 christos *
2194 1.39 christos * Compute, recursively if necessary, the largest offset among multiple
2195 1.39 christos * re paths.
2196 1.39 christos */
2197 1.39 christos static int
2198 1.39 christos altoffset(sop *scan, int offset)
2199 1.39 christos {
2200 1.39 christos int largest;
2201 1.39 christos int try;
2202 1.39 christos sop s;
2203 1.39 christos
2204 1.39 christos _DIAGASSERT(scan != NULL);
2205 1.39 christos
2206 1.39 christos /* If we gave up already on offsets, return */
2207 1.39 christos if (offset == -1)
2208 1.39 christos return -1;
2209 1.39 christos
2210 1.39 christos largest = 0;
2211 1.39 christos try = 0;
2212 1.39 christos s = *scan++;
2213 1.40 christos while (OP(s) != O_QUEST && OP(s) != O_CH) {
2214 1.39 christos switch (OP(s)) {
2215 1.39 christos case OOR1:
2216 1.39 christos if (try > largest)
2217 1.39 christos largest = try;
2218 1.39 christos try = 0;
2219 1.39 christos break;
2220 1.39 christos case OQUEST_:
2221 1.39 christos case OCH_:
2222 1.39 christos try = altoffset(scan, try);
2223 1.39 christos if (try == -1)
2224 1.39 christos return -1;
2225 1.39 christos scan--;
2226 1.39 christos do {
2227 1.39 christos scan += OPND(s);
2228 1.39 christos s = *scan;
2229 1.40 christos if (OP(s) != O_QUEST &&
2230 1.40 christos OP(s) != O_CH && OP(s) != OOR2)
2231 1.39 christos return -1;
2232 1.40 christos } while (OP(s) != O_QUEST && OP(s) != O_CH);
2233 1.39 christos /* We must skip to the next position, or we'll
2234 1.39 christos * leave altoffset() too early.
2235 1.39 christos */
2236 1.39 christos scan++;
2237 1.39 christos break;
2238 1.39 christos case OANYOF:
2239 1.39 christos case OCHAR:
2240 1.39 christos case OANY:
2241 1.39 christos try++;
2242 1.40 christos /*FALLTHROUGH*/
2243 1.39 christos case OBOW:
2244 1.39 christos case OEOW:
2245 1.39 christos case OWBND:
2246 1.39 christos case ONWBND:
2247 1.39 christos case OLPAREN:
2248 1.39 christos case ORPAREN:
2249 1.39 christos case OOR2:
2250 1.39 christos break;
2251 1.39 christos default:
2252 1.39 christos try = -1;
2253 1.39 christos break;
2254 1.39 christos }
2255 1.39 christos if (try == -1)
2256 1.39 christos return -1;
2257 1.39 christos s = *scan++;
2258 1.39 christos }
2259 1.39 christos
2260 1.39 christos if (try > largest)
2261 1.39 christos largest = try;
2262 1.39 christos
2263 1.39 christos return largest+offset;
2264 1.39 christos }
2265 1.39 christos
2266 1.39 christos /*
2267 1.39 christos - computejumps - compute char jumps for BM scan
2268 1.39 christos == static void computejumps(struct parse *p, struct re_guts *g);
2269 1.39 christos *
2270 1.39 christos * This algorithm assumes g->must exists and is has size greater than
2271 1.39 christos * zero. It's based on the algorithm found on Computer Algorithms by
2272 1.39 christos * Sara Baase.
2273 1.39 christos *
2274 1.39 christos * A char jump is the number of characters one needs to jump based on
2275 1.39 christos * the value of the character from the text that was mismatched.
2276 1.39 christos */
2277 1.39 christos static void
2278 1.39 christos computejumps(struct parse *p, struct re_guts *g)
2279 1.39 christos {
2280 1.39 christos int ch;
2281 1.39 christos size_t mindex;
2282 1.39 christos
2283 1.39 christos _DIAGASSERT(p != NULL);
2284 1.39 christos _DIAGASSERT(g != NULL);
2285 1.39 christos
2286 1.39 christos /* Avoid making errors worse */
2287 1.39 christos if (p->error != 0)
2288 1.39 christos return;
2289 1.39 christos
2290 1.39 christos g->charjump = calloc((NC_MAX + 1), sizeof(*g->charjump));
2291 1.39 christos if (g->charjump == NULL) /* Not a fatal error */
2292 1.39 christos return;
2293 1.39 christos /* Adjust for signed chars, if necessary */
2294 1.39 christos g->charjump = &g->charjump[-(CHAR_MIN)];
2295 1.39 christos
2296 1.39 christos /* If the character does not exist in the pattern, the jump
2297 1.39 christos * is equal to the number of characters in the pattern.
2298 1.39 christos */
2299 1.39 christos for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
2300 1.39 christos g->charjump[ch] = g->mlen;
2301 1.39 christos
2302 1.39 christos /* If the character does exist, compute the jump that would
2303 1.39 christos * take us to the last character in the pattern equal to it
2304 1.39 christos * (notice that we match right to left, so that last character
2305 1.39 christos * is the first one that would be matched).
2306 1.39 christos */
2307 1.39 christos for (mindex = 0; mindex < g->mlen; mindex++)
2308 1.39 christos g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
2309 1.39 christos }
2310 1.39 christos
2311 1.39 christos /*
2312 1.39 christos - computematchjumps - compute match jumps for BM scan
2313 1.39 christos == static void computematchjumps(struct parse *p, struct re_guts *g);
2314 1.39 christos *
2315 1.39 christos * This algorithm assumes g->must exists and is has size greater than
2316 1.39 christos * zero. It's based on the algorithm found on Computer Algorithms by
2317 1.39 christos * Sara Baase.
2318 1.39 christos *
2319 1.39 christos * A match jump is the number of characters one needs to advance based
2320 1.39 christos * on the already-matched suffix.
2321 1.39 christos * Notice that all values here are minus (g->mlen-1), because of the way
2322 1.39 christos * the search algorithm works.
2323 1.39 christos */
2324 1.39 christos static void
2325 1.39 christos computematchjumps(struct parse *p, struct re_guts *g)
2326 1.39 christos {
2327 1.39 christos size_t mindex; /* General "must" iterator */
2328 1.39 christos size_t suffix; /* Keeps track of matching suffix */
2329 1.39 christos size_t ssuffix; /* Keeps track of suffixes' suffix */
2330 1.39 christos size_t* pmatches; /* pmatches[k] points to the next i
2331 1.39 christos * such that i+1...mlen is a substring
2332 1.39 christos * of k+1...k+mlen-i-1
2333 1.39 christos */
2334 1.39 christos
2335 1.39 christos _DIAGASSERT(p != NULL);
2336 1.39 christos _DIAGASSERT(g != NULL);
2337 1.39 christos
2338 1.39 christos /* Avoid making errors worse */
2339 1.39 christos if (p->error != 0)
2340 1.39 christos return;
2341 1.39 christos
2342 1.39 christos pmatches = calloc(g->mlen, sizeof(*pmatches));
2343 1.39 christos if (pmatches == NULL) {
2344 1.39 christos g->matchjump = NULL;
2345 1.39 christos return;
2346 1.39 christos }
2347 1.39 christos
2348 1.39 christos g->matchjump = calloc(g->mlen, sizeof(*g->matchjump));
2349 1.39 christos if (g->matchjump == NULL) { /* Not a fatal error */
2350 1.39 christos free(pmatches);
2351 1.39 christos return;
2352 1.39 christos }
2353 1.39 christos
2354 1.39 christos /* Set maximum possible jump for each character in the pattern */
2355 1.39 christos for (mindex = 0; mindex < g->mlen; mindex++)
2356 1.39 christos g->matchjump[mindex] = 2 * g->mlen - mindex - 1;
2357 1.39 christos
2358 1.39 christos /* Compute pmatches[] */
2359 1.39 christos for (suffix = mindex = g->mlen; mindex-- > 0; suffix--) {
2360 1.39 christos pmatches[mindex] = suffix;
2361 1.39 christos
2362 1.39 christos /* If a mismatch is found, interrupting the substring,
2363 1.39 christos * compute the matchjump for that position. If no
2364 1.39 christos * mismatch is found, then a text substring mismatched
2365 1.39 christos * against the suffix will also mismatch against the
2366 1.39 christos * substring.
2367 1.39 christos */
2368 1.39 christos while (suffix < g->mlen
2369 1.39 christos && g->must[mindex] != g->must[suffix]) {
2370 1.39 christos g->matchjump[suffix] = MIN(g->matchjump[suffix],
2371 1.39 christos g->mlen - mindex - 1);
2372 1.39 christos suffix = pmatches[suffix];
2373 1.39 christos }
2374 1.39 christos }
2375 1.39 christos
2376 1.39 christos /* Compute the matchjump up to the last substring found to jump
2377 1.39 christos * to the beginning of the largest must pattern prefix matching
2378 1.39 christos * it's own suffix.
2379 1.39 christos */
2380 1.39 christos for (mindex = 0; mindex <= suffix; mindex++)
2381 1.39 christos g->matchjump[mindex] = MIN(g->matchjump[mindex],
2382 1.39 christos g->mlen + suffix - mindex);
2383 1.39 christos
2384 1.39 christos ssuffix = pmatches[suffix];
2385 1.39 christos while (suffix < g->mlen) {
2386 1.39 christos while (suffix <= ssuffix && suffix < g->mlen) {
2387 1.39 christos g->matchjump[suffix] = MIN(g->matchjump[suffix],
2388 1.39 christos g->mlen + ssuffix - suffix);
2389 1.39 christos suffix++;
2390 1.39 christos }
2391 1.39 christos if (suffix < g->mlen)
2392 1.39 christos ssuffix = pmatches[ssuffix];
2393 1.39 christos }
2394 1.39 christos
2395 1.39 christos free(pmatches);
2396 1.39 christos }
2397 1.39 christos
2398 1.39 christos /*
2399 1.1 jtc - pluscount - count + nesting
2400 1.9 perry == static sopno pluscount(struct parse *p, struct re_guts *g);
2401 1.1 jtc */
2402 1.1 jtc static sopno /* nesting depth */
2403 1.39 christos pluscount(struct parse *p, struct re_guts *g)
2404 1.1 jtc {
2405 1.9 perry sop *scan;
2406 1.9 perry sop s;
2407 1.9 perry sopno plusnest = 0;
2408 1.9 perry sopno maxnest = 0;
2409 1.14 lukem
2410 1.14 lukem _DIAGASSERT(p != NULL);
2411 1.14 lukem _DIAGASSERT(g != NULL);
2412 1.1 jtc
2413 1.1 jtc if (p->error != 0)
2414 1.1 jtc return(0); /* there may not be an OEND */
2415 1.1 jtc
2416 1.1 jtc scan = g->strip + 1;
2417 1.1 jtc do {
2418 1.1 jtc s = *scan++;
2419 1.1 jtc switch (OP(s)) {
2420 1.1 jtc case OPLUS_:
2421 1.1 jtc plusnest++;
2422 1.1 jtc break;
2423 1.1 jtc case O_PLUS:
2424 1.1 jtc if (plusnest > maxnest)
2425 1.1 jtc maxnest = plusnest;
2426 1.1 jtc plusnest--;
2427 1.1 jtc break;
2428 1.1 jtc }
2429 1.1 jtc } while (OP(s) != OEND);
2430 1.1 jtc if (plusnest != 0)
2431 1.1 jtc g->iflags |= BAD;
2432 1.1 jtc return(maxnest);
2433 1.1 jtc }
2434