regcomp.c revision 1.48 1 1.48 christos /* $NetBSD: regcomp.c,v 1.48 2023/08/30 20:37:24 christos Exp $ */
2 1.6 cgd
3 1.5 cgd /*-
4 1.39 christos * SPDX-License-Identifier: BSD-3-Clause
5 1.39 christos *
6 1.39 christos * Copyright (c) 1992, 1993, 1994 Henry Spencer.
7 1.5 cgd * Copyright (c) 1992, 1993, 1994
8 1.5 cgd * The Regents of the University of California. All rights reserved.
9 1.5 cgd *
10 1.39 christos * Copyright (c) 2011 The FreeBSD Foundation
11 1.39 christos * All rights reserved.
12 1.39 christos * Portions of this software were developed by David Chisnall
13 1.39 christos * under sponsorship from the FreeBSD Foundation.
14 1.39 christos *
15 1.5 cgd * This code is derived from software contributed to Berkeley by
16 1.5 cgd * Henry Spencer.
17 1.5 cgd *
18 1.5 cgd * Redistribution and use in source and binary forms, with or without
19 1.5 cgd * modification, are permitted provided that the following conditions
20 1.5 cgd * are met:
21 1.5 cgd * 1. Redistributions of source code must retain the above copyright
22 1.5 cgd * notice, this list of conditions and the following disclaimer.
23 1.5 cgd * 2. Redistributions in binary form must reproduce the above copyright
24 1.5 cgd * notice, this list of conditions and the following disclaimer in the
25 1.5 cgd * documentation and/or other materials provided with the distribution.
26 1.18 agc * 3. Neither the name of the University nor the names of its contributors
27 1.18 agc * may be used to endorse or promote products derived from this software
28 1.18 agc * without specific prior written permission.
29 1.18 agc *
30 1.18 agc * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31 1.18 agc * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 1.18 agc * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 1.18 agc * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34 1.18 agc * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35 1.18 agc * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36 1.18 agc * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37 1.18 agc * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38 1.18 agc * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 1.18 agc * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40 1.18 agc * SUCH DAMAGE.
41 1.18 agc *
42 1.18 agc * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
43 1.18 agc */
44 1.18 agc
45 1.45 christos #if HAVE_NBTOOL_CONFIG_H
46 1.45 christos #include "nbtool_config.h"
47 1.45 christos #endif
48 1.45 christos
49 1.7 christos #include <sys/cdefs.h>
50 1.6 cgd #if 0
51 1.5 cgd static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
52 1.39 christos __FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 368359 2020-12-05 03:18:48Z kevans $");
53 1.6 cgd #endif
54 1.48 christos __RCSID("$NetBSD: regcomp.c,v 1.48 2023/08/30 20:37:24 christos Exp $");
55 1.42 christos
56 1.42 christos #ifndef LIBHACK
57 1.39 christos #define REGEX_GNU_EXTENSIONS
58 1.5 cgd
59 1.8 jtc #include "namespace.h"
60 1.42 christos #endif
61 1.1 jtc #include <sys/types.h>
62 1.39 christos #include <stdio.h>
63 1.39 christos #include <string.h>
64 1.1 jtc #include <ctype.h>
65 1.1 jtc #include <limits.h>
66 1.1 jtc #include <stdlib.h>
67 1.28 junyoung #include <regex.h>
68 1.39 christos #include <stdbool.h>
69 1.8 jtc
70 1.44 christos #if defined(__weak_alias) && !defined(LIBHACK)
71 1.16 mycroft __weak_alias(regcomp,_regcomp)
72 1.8 jtc #endif
73 1.1 jtc
74 1.39 christos #ifdef REGEX_LIBC_COLLATE
75 1.39 christos #include "collate.h"
76 1.39 christos #endif
77 1.39 christos
78 1.1 jtc #include "utils.h"
79 1.1 jtc #include "regex2.h"
80 1.1 jtc
81 1.1 jtc #include "cname.h"
82 1.1 jtc
83 1.1 jtc /*
84 1.39 christos * Branching context, used to keep track of branch state for all of the branch-
85 1.39 christos * aware functions. In addition to keeping track of branch positions for the
86 1.39 christos * p_branch_* functions, we use this to simplify some clumsiness in BREs for
87 1.39 christos * detection of whether ^ is acting as an anchor or being used erroneously and
88 1.39 christos * also for whether we're in a sub-expression or not.
89 1.39 christos */
90 1.39 christos struct branchc {
91 1.39 christos sopno start;
92 1.39 christos sopno back;
93 1.39 christos sopno fwd;
94 1.39 christos
95 1.39 christos int nbranch;
96 1.39 christos int nchain;
97 1.39 christos bool outer;
98 1.39 christos bool terminate;
99 1.39 christos };
100 1.39 christos
101 1.39 christos /*
102 1.1 jtc * parse structure, passed up and down to avoid global variables and
103 1.1 jtc * other clumsinesses
104 1.1 jtc */
105 1.1 jtc struct parse {
106 1.21 yamt const char *next; /* next character in RE */
107 1.21 yamt const char *end; /* end of string (-> NUL normally) */
108 1.1 jtc int error; /* has an error been seen? */
109 1.39 christos int gnuext;
110 1.1 jtc sop *strip; /* malloced strip */
111 1.1 jtc sopno ssize; /* malloced strip size (allocated) */
112 1.1 jtc sopno slen; /* malloced strip length (used) */
113 1.33 christos size_t ncsalloc; /* number of csets allocated */
114 1.1 jtc struct re_guts *g;
115 1.1 jtc # define NPAREN 10 /* we need to remember () 1-9 for back refs */
116 1.1 jtc sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
117 1.1 jtc sopno pend[NPAREN]; /* -> ) ([0] unused) */
118 1.39 christos bool allowbranch; /* can this expression branch? */
119 1.39 christos bool bre; /* convenience; is this a BRE? */
120 1.39 christos int pflags; /* other parsing flags -- legacy escapes? */
121 1.39 christos bool (*parse_expr)(struct parse *, struct branchc *);
122 1.39 christos void (*pre_parse)(struct parse *, struct branchc *);
123 1.39 christos void (*post_parse)(struct parse *, struct branchc *);
124 1.1 jtc };
125 1.1 jtc
126 1.39 christos #define PFLAG_LEGACY_ESC 0x00000001
127 1.39 christos
128 1.5 cgd /* ========= begin header generated by ./mkh ========= */
129 1.5 cgd #ifdef __cplusplus
130 1.5 cgd extern "C" {
131 1.5 cgd #endif
132 1.5 cgd
133 1.5 cgd /* === regcomp.c === */
134 1.39 christos static bool p_ere_exp(struct parse *p, struct branchc *bc);
135 1.26 junyoung static void p_str(struct parse *p);
136 1.39 christos static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
137 1.39 christos static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
138 1.39 christos static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
139 1.39 christos static bool p_branch_empty(struct parse *p, struct branchc *bc);
140 1.39 christos static bool p_branch_do(struct parse *p, struct branchc *bc);
141 1.39 christos static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
142 1.39 christos static void p_bre_post_parse(struct parse *p, struct branchc *bc);
143 1.39 christos static void p_re(struct parse *p, int end1, int end2);
144 1.39 christos static bool p_simp_re(struct parse *p, struct branchc *bc);
145 1.26 junyoung static int p_count(struct parse *p);
146 1.26 junyoung static void p_bracket(struct parse *p);
147 1.39 christos static int p_range_cmp(wchar_t c1, wchar_t c2);
148 1.26 junyoung static void p_b_term(struct parse *p, cset *cs);
149 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
150 1.39 christos static int p_b_pseudoclass(struct parse *p, char c);
151 1.39 christos #endif
152 1.26 junyoung static void p_b_cclass(struct parse *p, cset *cs);
153 1.39 christos static void p_b_cclass_named(struct parse *p, cset *cs, const char[]);
154 1.26 junyoung static void p_b_eclass(struct parse *p, cset *cs);
155 1.39 christos static wint_t p_b_symbol(struct parse *p);
156 1.39 christos static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
157 1.39 christos static bool may_escape(struct parse *p, const wint_t ch);
158 1.39 christos static wint_t othercase(wint_t ch);
159 1.39 christos static void bothcases(struct parse *p, wint_t ch);
160 1.39 christos static void ordinary(struct parse *p, wint_t ch);
161 1.26 junyoung static void nonnewline(struct parse *p);
162 1.39 christos static void repeat(struct parse *p, sopno start, int from, int to);
163 1.26 junyoung static int seterr(struct parse *p, int e);
164 1.26 junyoung static cset *allocset(struct parse *p);
165 1.26 junyoung static void freeset(struct parse *p, cset *cs);
166 1.39 christos static void CHadd(struct parse *p, cset *cs, wint_t ch);
167 1.39 christos static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
168 1.39 christos static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
169 1.39 christos static wint_t singleton(cset *cs);
170 1.26 junyoung static sopno dupl(struct parse *p, sopno start, sopno finish);
171 1.39 christos static void doemit(struct parse *p, sop op, size_t opnd);
172 1.39 christos static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
173 1.39 christos static void dofwd(struct parse *p, sopno pos, sop value);
174 1.30 christos static int enlarge(struct parse *p, sopno size);
175 1.26 junyoung static void stripsnug(struct parse *p, struct re_guts *g);
176 1.26 junyoung static void findmust(struct parse *p, struct re_guts *g);
177 1.39 christos static int altoffset(sop *scan, int offset);
178 1.39 christos static void computejumps(struct parse *p, struct re_guts *g);
179 1.39 christos static void computematchjumps(struct parse *p, struct re_guts *g);
180 1.26 junyoung static sopno pluscount(struct parse *p, struct re_guts *g);
181 1.39 christos static wint_t wgetnext(struct parse *p);
182 1.5 cgd
183 1.5 cgd #ifdef __cplusplus
184 1.5 cgd }
185 1.5 cgd #endif
186 1.5 cgd /* ========= end header generated by ./mkh ========= */
187 1.1 jtc
188 1.1 jtc static char nuls[10]; /* place to point scanner in event of error */
189 1.1 jtc
190 1.1 jtc /*
191 1.1 jtc * macros for use with parse structure
192 1.1 jtc * BEWARE: these know that the parse structure is named `p' !!!
193 1.1 jtc */
194 1.1 jtc #define PEEK() (*p->next)
195 1.1 jtc #define PEEK2() (*(p->next+1))
196 1.1 jtc #define MORE() (p->next < p->end)
197 1.1 jtc #define MORE2() (p->next+1 < p->end)
198 1.1 jtc #define SEE(c) (MORE() && PEEK() == (c))
199 1.1 jtc #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
200 1.39 christos #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a))
201 1.1 jtc #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
202 1.1 jtc #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
203 1.39 christos #define EATSPEC(a) (p->bre ? EATTWO('\\', a) : EAT(a))
204 1.1 jtc #define NEXT() (p->next++)
205 1.1 jtc #define NEXT2() (p->next += 2)
206 1.1 jtc #define NEXTn(n) (p->next += (n))
207 1.1 jtc #define GETNEXT() (*p->next++)
208 1.39 christos #define WGETNEXT() wgetnext(p)
209 1.1 jtc #define SETERROR(e) seterr(p, (e))
210 1.39 christos #define REQUIRE(co, e) ((co) || SETERROR(e))
211 1.1 jtc #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
212 1.39 christos #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
213 1.1 jtc #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
214 1.40 christos #define EMIT(op, sopnd) doemit(p, (op), (sopnd))
215 1.40 christos #define INSERT(op, pos) doinsert(p, (op), HERE()-(pos)+1, pos)
216 1.12 drochner #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
217 1.1 jtc #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
218 1.1 jtc #define HERE() (p->slen)
219 1.1 jtc #define THERE() (p->slen - 1)
220 1.4 jtc #define THERETHERE() (p->slen - 2)
221 1.1 jtc #define DROP(n) (p->slen -= (n))
222 1.1 jtc
223 1.39 christos /* Macro used by computejump()/computematchjump() */
224 1.41 christos #ifndef MIN
225 1.39 christos #define MIN(a,b) ((a)<(b)?(a):(b))
226 1.41 christos #endif
227 1.30 christos
228 1.42 christos #ifndef NLS
229 1.42 christos static const struct {
230 1.42 christos const char *name;
231 1.42 christos int (*func)(int);
232 1.42 christos } wctypes[] = {
233 1.42 christos #define ADD(x) { .name = # x, .func = is ## x }
234 1.42 christos ADD(alnum),
235 1.42 christos ADD(alpha),
236 1.42 christos ADD(blank),
237 1.42 christos ADD(cntrl),
238 1.42 christos ADD(digit),
239 1.42 christos ADD(graph),
240 1.42 christos ADD(lower),
241 1.42 christos ADD(print),
242 1.42 christos ADD(punct),
243 1.42 christos ADD(space),
244 1.42 christos ADD(upper),
245 1.42 christos ADD(xdigit),
246 1.42 christos #undef ADD
247 1.42 christos };
248 1.42 christos
249 1.42 christos wctype_t
250 1.42 christos __regex_wctype(const char *str)
251 1.42 christos {
252 1.42 christos for (size_t i = 0; i < __arraycount(wctypes); i++) {
253 1.42 christos if (strcmp(wctypes[i].name, str) == 0)
254 1.42 christos return (wctype_t)(i + 1);
255 1.42 christos }
256 1.42 christos return (wctype_t)0;
257 1.42 christos }
258 1.42 christos
259 1.42 christos int
260 1.42 christos __regex_iswctype(wint_t c, wctype_t ct)
261 1.42 christos {
262 1.42 christos if (ct == 0)
263 1.42 christos return 0;
264 1.42 christos return (*wctypes[ct - 1].func)(c);
265 1.42 christos }
266 1.42 christos #endif
267 1.42 christos
268 1.39 christos static int /* 0 success, otherwise REG_something */
269 1.39 christos regcomp_internal(regex_t * __restrict preg,
270 1.39 christos const char * __restrict pattern,
271 1.39 christos int cflags, int pflags)
272 1.1 jtc {
273 1.1 jtc struct parse pa;
274 1.9 perry struct re_guts *g;
275 1.9 perry struct parse *p = &pa;
276 1.9 perry int i;
277 1.9 perry size_t len;
278 1.39 christos size_t maxlen;
279 1.3 jtc #ifdef REDEBUG
280 1.3 jtc # define GOODFLAGS(f) (f)
281 1.3 jtc #else
282 1.3 jtc # define GOODFLAGS(f) ((f)&~REG_DUMP)
283 1.3 jtc #endif
284 1.1 jtc
285 1.14 lukem _DIAGASSERT(preg != NULL);
286 1.14 lukem _DIAGASSERT(pattern != NULL);
287 1.14 lukem
288 1.3 jtc cflags = GOODFLAGS(cflags);
289 1.1 jtc if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
290 1.1 jtc return(REG_INVARG);
291 1.1 jtc
292 1.1 jtc if (cflags®_PEND) {
293 1.1 jtc if (preg->re_endp < pattern)
294 1.1 jtc return(REG_INVARG);
295 1.1 jtc len = preg->re_endp - pattern;
296 1.1 jtc } else
297 1.11 christos len = strlen(pattern);
298 1.1 jtc
299 1.1 jtc /* do the mallocs early so failure handling is easy */
300 1.39 christos g = malloc(sizeof(*g));
301 1.1 jtc if (g == NULL)
302 1.1 jtc return(REG_ESPACE);
303 1.39 christos /*
304 1.39 christos * Limit the pattern space to avoid a 32-bit overflow on buffer
305 1.39 christos * extension. Also avoid any signed overflow in case of conversion
306 1.39 christos * so make the real limit based on a 31-bit overflow.
307 1.39 christos *
308 1.39 christos * Likely not applicable on 64-bit systems but handle the case
309 1.39 christos * generically (who are we to stop people from using ~715MB+
310 1.39 christos * patterns?).
311 1.39 christos */
312 1.40 christos maxlen = ((size_t)-1 >> 1) / sizeof(*p->strip) * 2 / 3;
313 1.39 christos if (len >= maxlen) {
314 1.40 christos free(g);
315 1.39 christos return(REG_ESPACE);
316 1.39 christos }
317 1.40 christos p->ssize = (sopno)(len / 2 * 3 + 1); /* ugh */
318 1.39 christos assert(p->ssize >= len);
319 1.39 christos
320 1.39 christos p->strip = calloc(p->ssize, sizeof(*p->strip));
321 1.1 jtc p->slen = 0;
322 1.1 jtc if (p->strip == NULL) {
323 1.40 christos free(g);
324 1.1 jtc return(REG_ESPACE);
325 1.1 jtc }
326 1.1 jtc
327 1.1 jtc /* set things up */
328 1.1 jtc p->g = g;
329 1.39 christos p->next = pattern; /* convenience; we do not modify it */
330 1.1 jtc p->end = p->next + len;
331 1.1 jtc p->error = 0;
332 1.1 jtc p->ncsalloc = 0;
333 1.39 christos p->pflags = pflags;
334 1.1 jtc for (i = 0; i < NPAREN; i++) {
335 1.1 jtc p->pbegin[i] = 0;
336 1.1 jtc p->pend[i] = 0;
337 1.1 jtc }
338 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
339 1.39 christos if ((cflags & REG_GNU) == 0) {
340 1.39 christos p->gnuext = false;
341 1.39 christos p->allowbranch = (cflags & REG_EXTENDED) != 0;
342 1.39 christos } else
343 1.39 christos p->gnuext = p->allowbranch = true;
344 1.39 christos #else
345 1.39 christos p->gnuext = false;
346 1.39 christos p->allowbranch = (cflags & REG_EXTENDED) != 0;
347 1.39 christos #endif
348 1.39 christos if (cflags & REG_EXTENDED) {
349 1.39 christos p->bre = false;
350 1.39 christos p->parse_expr = p_ere_exp;
351 1.39 christos p->pre_parse = NULL;
352 1.39 christos p->post_parse = NULL;
353 1.39 christos } else {
354 1.39 christos p->bre = true;
355 1.39 christos p->parse_expr = p_simp_re;
356 1.39 christos p->pre_parse = p_bre_pre_parse;
357 1.39 christos p->post_parse = p_bre_post_parse;
358 1.39 christos }
359 1.1 jtc g->sets = NULL;
360 1.1 jtc g->ncsets = 0;
361 1.1 jtc g->cflags = cflags;
362 1.1 jtc g->iflags = 0;
363 1.1 jtc g->nbol = 0;
364 1.1 jtc g->neol = 0;
365 1.1 jtc g->must = NULL;
366 1.39 christos g->moffset = -1;
367 1.39 christos g->charjump = NULL;
368 1.39 christos g->matchjump = NULL;
369 1.1 jtc g->mlen = 0;
370 1.1 jtc g->nsub = 0;
371 1.1 jtc g->backrefs = 0;
372 1.1 jtc
373 1.1 jtc /* do it */
374 1.1 jtc EMIT(OEND, 0);
375 1.1 jtc g->firststate = THERE();
376 1.39 christos if (cflags & REG_NOSPEC)
377 1.1 jtc p_str(p);
378 1.1 jtc else
379 1.39 christos p_re(p, OUT, OUT);
380 1.1 jtc EMIT(OEND, 0);
381 1.1 jtc g->laststate = THERE();
382 1.1 jtc
383 1.1 jtc /* tidy up loose ends and fill things in */
384 1.1 jtc stripsnug(p, g);
385 1.1 jtc findmust(p, g);
386 1.39 christos /* only use Boyer-Moore algorithm if the pattern is bigger
387 1.39 christos * than three characters
388 1.39 christos */
389 1.39 christos if(g->mlen > 3) {
390 1.39 christos computejumps(p, g);
391 1.39 christos computematchjumps(p, g);
392 1.39 christos if(g->matchjump == NULL && g->charjump != NULL) {
393 1.39 christos free(g->charjump);
394 1.39 christos g->charjump = NULL;
395 1.39 christos }
396 1.39 christos }
397 1.1 jtc g->nplus = pluscount(p, g);
398 1.1 jtc g->magic = MAGIC2;
399 1.1 jtc preg->re_nsub = g->nsub;
400 1.1 jtc preg->re_g = g;
401 1.1 jtc preg->re_magic = MAGIC1;
402 1.1 jtc #ifndef REDEBUG
403 1.1 jtc /* not debugging, so can't rely on the assert() in regexec() */
404 1.1 jtc if (g->iflags&BAD)
405 1.1 jtc SETERROR(REG_ASSERT);
406 1.1 jtc #endif
407 1.1 jtc
408 1.1 jtc /* win or lose, we're done */
409 1.1 jtc if (p->error != 0) /* lose */
410 1.1 jtc regfree(preg);
411 1.1 jtc return(p->error);
412 1.1 jtc }
413 1.1 jtc
414 1.1 jtc /*
415 1.39 christos - regcomp - interface for parser and compilation
416 1.39 christos = extern int regcomp(regex_t *, const char *, int);
417 1.39 christos = #define REG_BASIC 0000
418 1.39 christos = #define REG_EXTENDED 0001
419 1.39 christos = #define REG_ICASE 0002
420 1.39 christos = #define REG_NOSUB 0004
421 1.39 christos = #define REG_NEWLINE 0010
422 1.39 christos = #define REG_NOSPEC 0020
423 1.39 christos = #define REG_PEND 0040
424 1.39 christos = #define REG_DUMP 0200
425 1.1 jtc */
426 1.39 christos int /* 0 success, otherwise REG_something */
427 1.39 christos regcomp(regex_t * __restrict preg,
428 1.39 christos const char * __restrict pattern,
429 1.39 christos int cflags)
430 1.1 jtc {
431 1.30 christos
432 1.39 christos return (regcomp_internal(preg, pattern, cflags, 0));
433 1.1 jtc }
434 1.1 jtc
435 1.1 jtc /*
436 1.39 christos - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
437 1.39 christos - return whether we should terminate or not
438 1.39 christos == static bool p_ere_exp(struct parse *p);
439 1.1 jtc */
440 1.39 christos static bool
441 1.39 christos p_ere_exp(struct parse *p, struct branchc *bc)
442 1.1 jtc {
443 1.9 perry char c;
444 1.39 christos wint_t wc;
445 1.9 perry sopno pos;
446 1.9 perry int count;
447 1.9 perry int count2;
448 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
449 1.40 christos size_t i;
450 1.39 christos int handled;
451 1.39 christos #endif
452 1.9 perry sopno subno;
453 1.1 jtc int wascaret = 0;
454 1.1 jtc
455 1.14 lukem _DIAGASSERT(p != NULL);
456 1.14 lukem
457 1.39 christos (void)bc;
458 1.1 jtc assert(MORE()); /* caller should have ensured this */
459 1.1 jtc c = GETNEXT();
460 1.1 jtc
461 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
462 1.39 christos handled = 0;
463 1.39 christos #endif
464 1.1 jtc pos = HERE();
465 1.1 jtc switch (c) {
466 1.1 jtc case '(':
467 1.39 christos (void)REQUIRE(MORE(), REG_EPAREN);
468 1.1 jtc p->g->nsub++;
469 1.40 christos subno = (sopno)p->g->nsub;
470 1.1 jtc if (subno < NPAREN)
471 1.1 jtc p->pbegin[subno] = HERE();
472 1.1 jtc EMIT(OLPAREN, subno);
473 1.1 jtc if (!SEE(')'))
474 1.39 christos p_re(p, ')', IGN);
475 1.1 jtc if (subno < NPAREN) {
476 1.1 jtc p->pend[subno] = HERE();
477 1.1 jtc assert(p->pend[subno] != 0);
478 1.1 jtc }
479 1.1 jtc EMIT(ORPAREN, subno);
480 1.39 christos (void)MUSTEAT(')', REG_EPAREN);
481 1.1 jtc break;
482 1.1 jtc #ifndef POSIX_MISTAKE
483 1.1 jtc case ')': /* happens only if no current unmatched ( */
484 1.1 jtc /*
485 1.1 jtc * You may ask, why the ifndef? Because I didn't notice
486 1.1 jtc * this until slightly too late for 1003.2, and none of the
487 1.1 jtc * other 1003.2 regular-expression reviewers noticed it at
488 1.1 jtc * all. So an unmatched ) is legal POSIX, at least until
489 1.1 jtc * we can get it fixed.
490 1.1 jtc */
491 1.1 jtc SETERROR(REG_EPAREN);
492 1.1 jtc break;
493 1.1 jtc #endif
494 1.1 jtc case '^':
495 1.1 jtc EMIT(OBOL, 0);
496 1.1 jtc p->g->iflags |= USEBOL;
497 1.1 jtc p->g->nbol++;
498 1.1 jtc wascaret = 1;
499 1.1 jtc break;
500 1.1 jtc case '$':
501 1.1 jtc EMIT(OEOL, 0);
502 1.1 jtc p->g->iflags |= USEEOL;
503 1.1 jtc p->g->neol++;
504 1.1 jtc break;
505 1.1 jtc case '|':
506 1.1 jtc SETERROR(REG_EMPTY);
507 1.1 jtc break;
508 1.1 jtc case '*':
509 1.1 jtc case '+':
510 1.1 jtc case '?':
511 1.39 christos case '{':
512 1.1 jtc SETERROR(REG_BADRPT);
513 1.1 jtc break;
514 1.1 jtc case '.':
515 1.1 jtc if (p->g->cflags®_NEWLINE)
516 1.1 jtc nonnewline(p);
517 1.1 jtc else
518 1.1 jtc EMIT(OANY, 0);
519 1.1 jtc break;
520 1.1 jtc case '[':
521 1.1 jtc p_bracket(p);
522 1.1 jtc break;
523 1.1 jtc case '\\':
524 1.39 christos (void)REQUIRE(MORE(), REG_EESCAPE);
525 1.39 christos wc = WGETNEXT();
526 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
527 1.39 christos if (p->gnuext) {
528 1.39 christos handled = 1;
529 1.39 christos switch (wc) {
530 1.39 christos case '`':
531 1.39 christos EMIT(OBOS, 0);
532 1.39 christos break;
533 1.39 christos case '\'':
534 1.39 christos EMIT(OEOS, 0);
535 1.39 christos break;
536 1.39 christos case 'B':
537 1.39 christos EMIT(ONWBND, 0);
538 1.39 christos break;
539 1.39 christos case 'b':
540 1.39 christos EMIT(OWBND, 0);
541 1.39 christos break;
542 1.39 christos case 'W':
543 1.39 christos case 'w':
544 1.39 christos case 'S':
545 1.39 christos case 's':
546 1.39 christos p_b_pseudoclass(p, wc);
547 1.39 christos break;
548 1.46 christos case 'a':
549 1.46 christos ordinary(p, '\a');
550 1.46 christos break;
551 1.46 christos case 'e':
552 1.46 christos ordinary(p, '\e');
553 1.46 christos break;
554 1.46 christos case 'f':
555 1.46 christos ordinary(p, '\f');
556 1.46 christos break;
557 1.46 christos case 'n':
558 1.46 christos ordinary(p, '\n');
559 1.46 christos break;
560 1.46 christos case 'r':
561 1.46 christos ordinary(p, '\r');
562 1.46 christos break;
563 1.46 christos case 't':
564 1.46 christos ordinary(p, '\t');
565 1.46 christos break;
566 1.46 christos case 'v':
567 1.46 christos ordinary(p, '\v');
568 1.46 christos break;
569 1.39 christos case '1':
570 1.39 christos case '2':
571 1.39 christos case '3':
572 1.39 christos case '4':
573 1.39 christos case '5':
574 1.39 christos case '6':
575 1.39 christos case '7':
576 1.39 christos case '8':
577 1.39 christos case '9':
578 1.39 christos i = wc - '0';
579 1.39 christos assert(i < NPAREN);
580 1.39 christos if (p->pend[i] != 0) {
581 1.39 christos assert(i <= p->g->nsub);
582 1.39 christos EMIT(OBACK_, i);
583 1.39 christos assert(p->pbegin[i] != 0);
584 1.39 christos assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
585 1.39 christos assert(OP(p->strip[p->pend[i]]) == ORPAREN);
586 1.39 christos (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
587 1.39 christos EMIT(O_BACK, i);
588 1.39 christos } else
589 1.39 christos SETERROR(REG_ESUBREG);
590 1.39 christos p->g->backrefs = 1;
591 1.39 christos break;
592 1.39 christos default:
593 1.39 christos handled = 0;
594 1.39 christos }
595 1.39 christos /* Don't proceed to the POSIX bits if we've already handled it */
596 1.39 christos if (handled)
597 1.39 christos break;
598 1.39 christos }
599 1.39 christos #endif
600 1.39 christos switch (wc) {
601 1.39 christos case '<':
602 1.39 christos EMIT(OBOW, 0);
603 1.39 christos break;
604 1.39 christos case '>':
605 1.39 christos EMIT(OEOW, 0);
606 1.39 christos break;
607 1.39 christos default:
608 1.39 christos if (may_escape(p, wc))
609 1.39 christos ordinary(p, wc);
610 1.39 christos else
611 1.39 christos SETERROR(REG_EESCAPE);
612 1.39 christos break;
613 1.39 christos }
614 1.1 jtc break;
615 1.1 jtc default:
616 1.38 christos if (p->error != 0)
617 1.39 christos return (false);
618 1.39 christos p->next--;
619 1.39 christos wc = WGETNEXT();
620 1.39 christos ordinary(p, wc);
621 1.1 jtc break;
622 1.1 jtc }
623 1.1 jtc
624 1.1 jtc if (!MORE())
625 1.39 christos return (false);
626 1.1 jtc c = PEEK();
627 1.1 jtc /* we call { a repetition if followed by a digit */
628 1.39 christos if (!( c == '*' || c == '+' || c == '?' || c == '{'))
629 1.39 christos return (false); /* no repetition, we're done */
630 1.39 christos else if (c == '{')
631 1.39 christos (void)REQUIRE(MORE2() && \
632 1.39 christos (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
633 1.1 jtc NEXT();
634 1.1 jtc
635 1.39 christos (void)REQUIRE(!wascaret, REG_BADRPT);
636 1.1 jtc switch (c) {
637 1.1 jtc case '*': /* implemented as +? */
638 1.4 jtc /* this case does not require the (y|) trick, noKLUDGE */
639 1.1 jtc INSERT(OPLUS_, pos);
640 1.1 jtc ASTERN(O_PLUS, pos);
641 1.1 jtc INSERT(OQUEST_, pos);
642 1.1 jtc ASTERN(O_QUEST, pos);
643 1.1 jtc break;
644 1.1 jtc case '+':
645 1.1 jtc INSERT(OPLUS_, pos);
646 1.1 jtc ASTERN(O_PLUS, pos);
647 1.1 jtc break;
648 1.1 jtc case '?':
649 1.4 jtc /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
650 1.4 jtc INSERT(OCH_, pos); /* offset slightly wrong */
651 1.4 jtc ASTERN(OOR1, pos); /* this one's right */
652 1.4 jtc AHEAD(pos); /* fix the OCH_ */
653 1.4 jtc EMIT(OOR2, 0); /* offset very wrong... */
654 1.4 jtc AHEAD(THERE()); /* ...so fix it */
655 1.4 jtc ASTERN(O_CH, THERETHERE());
656 1.1 jtc break;
657 1.1 jtc case '{':
658 1.1 jtc count = p_count(p);
659 1.1 jtc if (EAT(',')) {
660 1.39 christos if (isdigit((uch)PEEK())) {
661 1.1 jtc count2 = p_count(p);
662 1.39 christos (void)REQUIRE(count <= count2, REG_BADBR);
663 1.1 jtc } else /* single number with comma */
664 1.1 jtc count2 = INFINITY;
665 1.1 jtc } else /* just a single number */
666 1.1 jtc count2 = count;
667 1.39 christos repeat(p, pos, count, count2);
668 1.1 jtc if (!EAT('}')) { /* error heuristics */
669 1.1 jtc while (MORE() && PEEK() != '}')
670 1.1 jtc NEXT();
671 1.39 christos (void)REQUIRE(MORE(), REG_EBRACE);
672 1.1 jtc SETERROR(REG_BADBR);
673 1.1 jtc }
674 1.1 jtc break;
675 1.1 jtc }
676 1.1 jtc
677 1.1 jtc if (!MORE())
678 1.39 christos return (false);
679 1.1 jtc c = PEEK();
680 1.1 jtc if (!( c == '*' || c == '+' || c == '?' ||
681 1.39 christos (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
682 1.39 christos return (false);
683 1.1 jtc SETERROR(REG_BADRPT);
684 1.39 christos return (false);
685 1.1 jtc }
686 1.1 jtc
687 1.1 jtc /*
688 1.1 jtc - p_str - string (no metacharacters) "parser"
689 1.9 perry == static void p_str(struct parse *p);
690 1.1 jtc */
691 1.1 jtc static void
692 1.39 christos p_str(struct parse *p)
693 1.1 jtc {
694 1.39 christos (void)REQUIRE(MORE(), REG_EMPTY);
695 1.39 christos while (MORE())
696 1.39 christos ordinary(p, WGETNEXT());
697 1.39 christos }
698 1.14 lukem
699 1.39 christos /*
700 1.39 christos * Eat consecutive branch delimiters for the kind of expression that we are
701 1.39 christos * parsing, return the number of delimiters that we ate.
702 1.39 christos */
703 1.39 christos static int
704 1.39 christos p_branch_eat_delim(struct parse *p, struct branchc *bc)
705 1.39 christos {
706 1.39 christos int nskip;
707 1.14 lukem
708 1.39 christos (void)bc;
709 1.39 christos nskip = 0;
710 1.39 christos while (EATSPEC('|'))
711 1.39 christos ++nskip;
712 1.39 christos return (nskip);
713 1.1 jtc }
714 1.1 jtc
715 1.1 jtc /*
716 1.39 christos * Insert necessary branch book-keeping operations. This emits a
717 1.39 christos * bogus 'next' offset, since we still have more to parse
718 1.1 jtc */
719 1.1 jtc static void
720 1.39 christos p_branch_ins_offset(struct parse *p, struct branchc *bc)
721 1.9 perry {
722 1.1 jtc
723 1.39 christos if (bc->nbranch == 0) {
724 1.39 christos INSERT(OCH_, bc->start); /* offset is wrong */
725 1.39 christos bc->fwd = bc->start;
726 1.39 christos bc->back = bc->start;
727 1.39 christos }
728 1.39 christos
729 1.39 christos ASTERN(OOR1, bc->back);
730 1.39 christos bc->back = THERE();
731 1.39 christos AHEAD(bc->fwd); /* fix previous offset */
732 1.39 christos bc->fwd = HERE();
733 1.39 christos EMIT(OOR2, 0); /* offset is very wrong */
734 1.39 christos ++bc->nbranch;
735 1.39 christos }
736 1.39 christos
737 1.39 christos /*
738 1.39 christos * Fix the offset of the tail branch, if we actually had any branches.
739 1.39 christos * This is to correct the bogus placeholder offset that we use.
740 1.39 christos */
741 1.39 christos static void
742 1.39 christos p_branch_fix_tail(struct parse *p, struct branchc *bc)
743 1.39 christos {
744 1.14 lukem
745 1.39 christos /* Fix bogus offset at the tail if we actually have branches */
746 1.39 christos if (bc->nbranch > 0) {
747 1.39 christos AHEAD(bc->fwd);
748 1.39 christos ASTERN(O_CH, bc->back);
749 1.30 christos }
750 1.39 christos }
751 1.39 christos
752 1.39 christos /*
753 1.39 christos * Signal to the parser that an empty branch has been encountered; this will,
754 1.39 christos * in the future, be used to allow for more permissive behavior with empty
755 1.39 christos * branches. The return value should indicate whether parsing may continue
756 1.39 christos * or not.
757 1.39 christos */
758 1.39 christos static bool
759 1.39 christos p_branch_empty(struct parse *p, struct branchc *bc)
760 1.39 christos {
761 1.39 christos
762 1.39 christos (void)bc;
763 1.39 christos SETERROR(REG_EMPTY);
764 1.39 christos return (false);
765 1.39 christos }
766 1.39 christos
767 1.39 christos /*
768 1.39 christos * Take care of any branching requirements. This includes inserting the
769 1.39 christos * appropriate branching instructions as well as eating all of the branch
770 1.39 christos * delimiters until we either run out of pattern or need to parse more pattern.
771 1.39 christos */
772 1.39 christos static bool
773 1.39 christos p_branch_do(struct parse *p, struct branchc *bc)
774 1.39 christos {
775 1.39 christos int ate = 0;
776 1.39 christos
777 1.39 christos ate = p_branch_eat_delim(p, bc);
778 1.39 christos if (ate == 0)
779 1.39 christos return (false);
780 1.39 christos else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
781 1.39 christos /*
782 1.39 christos * Halt parsing only if we have an empty branch and p_branch_empty
783 1.39 christos * indicates that we must not continue. In the future, this will not
784 1.39 christos * necessarily be an error.
785 1.39 christos */
786 1.39 christos return (false);
787 1.39 christos p_branch_ins_offset(p, bc);
788 1.39 christos
789 1.39 christos return (true);
790 1.39 christos }
791 1.30 christos
792 1.39 christos static void
793 1.39 christos p_bre_pre_parse(struct parse *p, struct branchc *bc)
794 1.39 christos {
795 1.14 lukem
796 1.40 christos (void)bc;
797 1.39 christos /*
798 1.39 christos * Does not move cleanly into expression parser because of
799 1.39 christos * ordinary interpration of * at the beginning position of
800 1.39 christos * an expression.
801 1.39 christos */
802 1.1 jtc if (EAT('^')) {
803 1.1 jtc EMIT(OBOL, 0);
804 1.1 jtc p->g->iflags |= USEBOL;
805 1.1 jtc p->g->nbol++;
806 1.1 jtc }
807 1.39 christos }
808 1.39 christos
809 1.39 christos static void
810 1.39 christos p_bre_post_parse(struct parse *p, struct branchc *bc)
811 1.39 christos {
812 1.39 christos
813 1.39 christos /* Expression is terminating due to EOL token */
814 1.39 christos if (bc->terminate) {
815 1.1 jtc DROP(1);
816 1.1 jtc EMIT(OEOL, 0);
817 1.1 jtc p->g->iflags |= USEEOL;
818 1.1 jtc p->g->neol++;
819 1.1 jtc }
820 1.39 christos }
821 1.1 jtc
822 1.39 christos /*
823 1.39 christos - p_re - Top level parser, concatenation and BRE anchoring
824 1.39 christos == static void p_re(struct parse *p, int end1, int end2);
825 1.39 christos * Giving end1 as OUT essentially eliminates the end1/end2 check.
826 1.39 christos *
827 1.39 christos * This implementation is a bit of a kludge, in that a trailing $ is first
828 1.39 christos * taken as an ordinary character and then revised to be an anchor.
829 1.39 christos * The amount of lookahead needed to avoid this kludge is excessive.
830 1.39 christos */
831 1.39 christos static void
832 1.39 christos p_re(struct parse *p,
833 1.39 christos int end1, /* first terminating character */
834 1.39 christos int end2) /* second terminating character; ignored for EREs */
835 1.39 christos {
836 1.39 christos struct branchc bc;
837 1.39 christos
838 1.39 christos bc.nbranch = 0;
839 1.39 christos if (end1 == OUT && end2 == OUT)
840 1.39 christos bc.outer = true;
841 1.39 christos else
842 1.39 christos bc.outer = false;
843 1.39 christos #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2))
844 1.39 christos for (;;) {
845 1.39 christos bc.start = HERE();
846 1.39 christos bc.nchain = 0;
847 1.39 christos bc.terminate = false;
848 1.39 christos if (p->pre_parse != NULL)
849 1.39 christos p->pre_parse(p, &bc);
850 1.39 christos while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
851 1.39 christos bc.terminate = p->parse_expr(p, &bc);
852 1.39 christos ++bc.nchain;
853 1.39 christos }
854 1.39 christos if (p->post_parse != NULL)
855 1.39 christos p->post_parse(p, &bc);
856 1.39 christos (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY);
857 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
858 1.39 christos if (p->gnuext && HERE() == bc.start && !p_branch_empty(p, &bc))
859 1.39 christos break;
860 1.39 christos #endif
861 1.39 christos if (!p->allowbranch)
862 1.39 christos break;
863 1.39 christos /*
864 1.39 christos * p_branch_do's return value indicates whether we should
865 1.39 christos * continue parsing or not. This is both for correctness and
866 1.39 christos * a slight optimization, because it will check if we've
867 1.39 christos * encountered an empty branch or the end of the string
868 1.39 christos * immediately following a branch delimiter.
869 1.39 christos */
870 1.39 christos if (!p_branch_do(p, &bc))
871 1.39 christos break;
872 1.39 christos }
873 1.39 christos #undef SEE_END
874 1.39 christos if (p->allowbranch)
875 1.39 christos p_branch_fix_tail(p, &bc);
876 1.39 christos assert(!MORE() || SEE(end1));
877 1.1 jtc }
878 1.1 jtc
879 1.1 jtc /*
880 1.1 jtc - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
881 1.39 christos == static bool p_simp_re(struct parse *p, struct branchc *bc);
882 1.1 jtc */
883 1.39 christos static bool /* was the simple RE an unbackslashed $? */
884 1.39 christos p_simp_re(struct parse *p, struct branchc *bc)
885 1.1 jtc {
886 1.9 perry int c;
887 1.39 christos int cc; /* convenient/control character */
888 1.9 perry int count;
889 1.9 perry int count2;
890 1.39 christos sopno pos;
891 1.39 christos bool handled;
892 1.40 christos size_t i;
893 1.39 christos wint_t wc;
894 1.9 perry sopno subno;
895 1.1 jtc # define BACKSL (1<<CHAR_BIT)
896 1.1 jtc
897 1.39 christos pos = HERE(); /* repetition op, if any, covers from here */
898 1.39 christos handled = false;
899 1.1 jtc
900 1.1 jtc assert(MORE()); /* caller should have ensured this */
901 1.48 christos c = (uch)GETNEXT();
902 1.1 jtc if (c == '\\') {
903 1.39 christos (void)REQUIRE(MORE(), REG_EESCAPE);
904 1.48 christos cc = (uch)GETNEXT();
905 1.39 christos c = BACKSL | cc;
906 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
907 1.39 christos if (p->gnuext) {
908 1.39 christos handled = true;
909 1.39 christos switch (c) {
910 1.39 christos case BACKSL|'`':
911 1.39 christos EMIT(OBOS, 0);
912 1.39 christos break;
913 1.39 christos case BACKSL|'\'':
914 1.39 christos EMIT(OEOS, 0);
915 1.39 christos break;
916 1.39 christos case BACKSL|'B':
917 1.39 christos EMIT(ONWBND, 0);
918 1.39 christos break;
919 1.39 christos case BACKSL|'b':
920 1.39 christos EMIT(OWBND, 0);
921 1.39 christos break;
922 1.39 christos case BACKSL|'W':
923 1.39 christos case BACKSL|'w':
924 1.39 christos case BACKSL|'S':
925 1.39 christos case BACKSL|'s':
926 1.39 christos p_b_pseudoclass(p, cc);
927 1.39 christos break;
928 1.46 christos case BACKSL|'a':
929 1.46 christos ordinary(p, '\a');
930 1.46 christos break;
931 1.46 christos case BACKSL|'e':
932 1.46 christos ordinary(p, '\e');
933 1.46 christos break;
934 1.46 christos case BACKSL|'f':
935 1.46 christos ordinary(p, '\f');
936 1.46 christos break;
937 1.46 christos case BACKSL|'n':
938 1.46 christos ordinary(p, '\n');
939 1.46 christos break;
940 1.46 christos case BACKSL|'r':
941 1.46 christos ordinary(p, '\r');
942 1.46 christos break;
943 1.46 christos case BACKSL|'t':
944 1.46 christos ordinary(p, '\t');
945 1.46 christos break;
946 1.46 christos case BACKSL|'v':
947 1.46 christos ordinary(p, '\v');
948 1.46 christos break;
949 1.39 christos default:
950 1.39 christos handled = false;
951 1.39 christos }
952 1.39 christos }
953 1.39 christos #endif
954 1.1 jtc }
955 1.39 christos if (!handled) {
956 1.39 christos switch (c) {
957 1.39 christos case '.':
958 1.39 christos if (p->g->cflags®_NEWLINE)
959 1.39 christos nonnewline(p);
960 1.39 christos else
961 1.39 christos EMIT(OANY, 0);
962 1.39 christos break;
963 1.39 christos case '[':
964 1.39 christos p_bracket(p);
965 1.39 christos break;
966 1.39 christos case BACKSL|'<':
967 1.39 christos EMIT(OBOW, 0);
968 1.39 christos break;
969 1.39 christos case BACKSL|'>':
970 1.39 christos EMIT(OEOW, 0);
971 1.39 christos break;
972 1.39 christos case BACKSL|'{':
973 1.39 christos SETERROR(REG_BADRPT);
974 1.39 christos break;
975 1.39 christos case BACKSL|'(':
976 1.39 christos p->g->nsub++;
977 1.40 christos subno = (sopno)p->g->nsub;
978 1.39 christos if (subno < NPAREN)
979 1.39 christos p->pbegin[subno] = HERE();
980 1.39 christos EMIT(OLPAREN, subno);
981 1.39 christos /* the MORE here is an error heuristic */
982 1.39 christos if (MORE() && !SEETWO('\\', ')'))
983 1.39 christos p_re(p, '\\', ')');
984 1.39 christos if (subno < NPAREN) {
985 1.39 christos p->pend[subno] = HERE();
986 1.39 christos assert(p->pend[subno] != 0);
987 1.39 christos }
988 1.39 christos EMIT(ORPAREN, subno);
989 1.39 christos (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
990 1.39 christos break;
991 1.39 christos case BACKSL|')': /* should not get here -- must be user */
992 1.39 christos SETERROR(REG_EPAREN);
993 1.39 christos break;
994 1.39 christos case BACKSL|'1':
995 1.39 christos case BACKSL|'2':
996 1.39 christos case BACKSL|'3':
997 1.39 christos case BACKSL|'4':
998 1.39 christos case BACKSL|'5':
999 1.39 christos case BACKSL|'6':
1000 1.39 christos case BACKSL|'7':
1001 1.39 christos case BACKSL|'8':
1002 1.39 christos case BACKSL|'9':
1003 1.39 christos i = (c&~BACKSL) - '0';
1004 1.39 christos assert(i < NPAREN);
1005 1.39 christos if (p->pend[i] != 0) {
1006 1.39 christos assert(i <= p->g->nsub);
1007 1.39 christos EMIT(OBACK_, i);
1008 1.39 christos assert(p->pbegin[i] != 0);
1009 1.39 christos assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
1010 1.39 christos assert(OP(p->strip[p->pend[i]]) == ORPAREN);
1011 1.39 christos (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
1012 1.39 christos EMIT(O_BACK, i);
1013 1.39 christos } else
1014 1.39 christos SETERROR(REG_ESUBREG);
1015 1.39 christos p->g->backrefs = 1;
1016 1.39 christos break;
1017 1.39 christos case '*':
1018 1.39 christos /*
1019 1.39 christos * Ordinary if used as the first character beyond BOL anchor of
1020 1.39 christos * a (sub-)expression, counts as a bad repetition operator if it
1021 1.39 christos * appears otherwise.
1022 1.39 christos */
1023 1.39 christos (void)REQUIRE(bc->nchain == 0, REG_BADRPT);
1024 1.39 christos /* FALLTHROUGH */
1025 1.39 christos default:
1026 1.39 christos if (p->error != 0)
1027 1.39 christos return (false); /* Definitely not $... */
1028 1.39 christos p->next--;
1029 1.39 christos wc = WGETNEXT();
1030 1.39 christos if ((c & BACKSL) == 0 || may_escape(p, wc))
1031 1.39 christos ordinary(p, wc);
1032 1.39 christos else
1033 1.39 christos SETERROR(REG_EESCAPE);
1034 1.39 christos break;
1035 1.1 jtc }
1036 1.1 jtc }
1037 1.1 jtc
1038 1.1 jtc if (EAT('*')) { /* implemented as +? */
1039 1.4 jtc /* this case does not require the (y|) trick, noKLUDGE */
1040 1.1 jtc INSERT(OPLUS_, pos);
1041 1.1 jtc ASTERN(O_PLUS, pos);
1042 1.1 jtc INSERT(OQUEST_, pos);
1043 1.1 jtc ASTERN(O_QUEST, pos);
1044 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
1045 1.39 christos } else if (p->gnuext && EATTWO('\\', '?')) {
1046 1.39 christos INSERT(OQUEST_, pos);
1047 1.39 christos ASTERN(O_QUEST, pos);
1048 1.39 christos } else if (p->gnuext && EATTWO('\\', '+')) {
1049 1.39 christos INSERT(OPLUS_, pos);
1050 1.39 christos ASTERN(O_PLUS, pos);
1051 1.39 christos #endif
1052 1.1 jtc } else if (EATTWO('\\', '{')) {
1053 1.1 jtc count = p_count(p);
1054 1.1 jtc if (EAT(',')) {
1055 1.39 christos if (MORE() && isdigit((uch)PEEK())) {
1056 1.1 jtc count2 = p_count(p);
1057 1.39 christos (void)REQUIRE(count <= count2, REG_BADBR);
1058 1.1 jtc } else /* single number with comma */
1059 1.1 jtc count2 = INFINITY;
1060 1.1 jtc } else /* just a single number */
1061 1.1 jtc count2 = count;
1062 1.39 christos repeat(p, pos, count, count2);
1063 1.1 jtc if (!EATTWO('\\', '}')) { /* error heuristics */
1064 1.1 jtc while (MORE() && !SEETWO('\\', '}'))
1065 1.1 jtc NEXT();
1066 1.39 christos (void)REQUIRE(MORE(), REG_EBRACE);
1067 1.1 jtc SETERROR(REG_BADBR);
1068 1.1 jtc }
1069 1.39 christos } else if (c == '$') /* $ (but not \$) ends it */
1070 1.39 christos return (true);
1071 1.1 jtc
1072 1.39 christos return (false);
1073 1.1 jtc }
1074 1.1 jtc
1075 1.1 jtc /*
1076 1.1 jtc - p_count - parse a repetition count
1077 1.9 perry == static int p_count(struct parse *p);
1078 1.1 jtc */
1079 1.1 jtc static int /* the value */
1080 1.39 christos p_count(struct parse *p)
1081 1.1 jtc {
1082 1.9 perry int count = 0;
1083 1.9 perry int ndigits = 0;
1084 1.1 jtc
1085 1.39 christos while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
1086 1.48 christos count = count*10 + ((uch)GETNEXT() - '0');
1087 1.1 jtc ndigits++;
1088 1.1 jtc }
1089 1.1 jtc
1090 1.39 christos (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
1091 1.1 jtc return(count);
1092 1.1 jtc }
1093 1.1 jtc
1094 1.1 jtc /*
1095 1.1 jtc - p_bracket - parse a bracketed character list
1096 1.9 perry == static void p_bracket(struct parse *p);
1097 1.1 jtc */
1098 1.1 jtc static void
1099 1.39 christos p_bracket(struct parse *p)
1100 1.1 jtc {
1101 1.14 lukem cset *cs;
1102 1.39 christos wint_t ch;
1103 1.14 lukem
1104 1.1 jtc /* Dept of Truly Sickening Special-Case Kludges */
1105 1.39 christos if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
1106 1.1 jtc EMIT(OBOW, 0);
1107 1.1 jtc NEXTn(6);
1108 1.1 jtc return;
1109 1.1 jtc }
1110 1.39 christos if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
1111 1.1 jtc EMIT(OEOW, 0);
1112 1.1 jtc NEXTn(6);
1113 1.1 jtc return;
1114 1.1 jtc }
1115 1.1 jtc
1116 1.39 christos if ((cs = allocset(p)) == NULL)
1117 1.39 christos return;
1118 1.39 christos
1119 1.39 christos if (p->g->cflags®_ICASE)
1120 1.39 christos cs->icase = 1;
1121 1.1 jtc if (EAT('^'))
1122 1.39 christos cs->invert = 1;
1123 1.1 jtc if (EAT(']'))
1124 1.39 christos CHadd(p, cs, ']');
1125 1.1 jtc else if (EAT('-'))
1126 1.39 christos CHadd(p, cs, '-');
1127 1.1 jtc while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
1128 1.1 jtc p_b_term(p, cs);
1129 1.1 jtc if (EAT('-'))
1130 1.39 christos CHadd(p, cs, '-');
1131 1.39 christos (void)MUSTEAT(']', REG_EBRACK);
1132 1.1 jtc
1133 1.1 jtc if (p->error != 0) /* don't mess things up further */
1134 1.1 jtc return;
1135 1.1 jtc
1136 1.39 christos if (cs->invert && p->g->cflags®_NEWLINE)
1137 1.39 christos cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
1138 1.1 jtc
1139 1.39 christos if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
1140 1.39 christos ordinary(p, ch);
1141 1.39 christos freeset(p, cs);
1142 1.39 christos } else
1143 1.40 christos EMIT(OANYOF, (size_t)(cs - p->g->sets));
1144 1.39 christos }
1145 1.1 jtc
1146 1.39 christos static int
1147 1.39 christos p_range_cmp(wchar_t c1, wchar_t c2)
1148 1.39 christos {
1149 1.39 christos #ifdef REGEX_LIBC_COLLATE
1150 1.39 christos return __wcollate_range_cmp(c1, c2);
1151 1.42 christos #elif defined(NLS)
1152 1.39 christos /* Copied from libc/collate __wcollate_range_cmp */
1153 1.39 christos wchar_t s1[2], s2[2];
1154 1.1 jtc
1155 1.39 christos s1[0] = c1;
1156 1.39 christos s1[1] = L'\0';
1157 1.39 christos s2[0] = c2;
1158 1.39 christos s2[1] = L'\0';
1159 1.42 christos return wcscoll(s1, s2);
1160 1.42 christos #else
1161 1.42 christos char s1[2], s2[2];
1162 1.42 christos
1163 1.42 christos s1[0] = (char)c1;
1164 1.42 christos s1[1] = '\0';
1165 1.42 christos s2[0] = (char)c2;
1166 1.42 christos s2[1] = '\0';
1167 1.42 christos return strcoll(s1, s2);
1168 1.39 christos #endif
1169 1.1 jtc }
1170 1.1 jtc
1171 1.1 jtc /*
1172 1.1 jtc - p_b_term - parse one term of a bracketed character list
1173 1.9 perry == static void p_b_term(struct parse *p, cset *cs);
1174 1.1 jtc */
1175 1.1 jtc static void
1176 1.39 christos p_b_term(struct parse *p, cset *cs)
1177 1.1 jtc {
1178 1.9 perry char c;
1179 1.39 christos wint_t start, finish;
1180 1.39 christos wint_t i;
1181 1.39 christos #ifdef REGEX_LIBC_COLLATE
1182 1.39 christos struct xlocale_collate *table =
1183 1.39 christos (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
1184 1.39 christos #endif
1185 1.1 jtc
1186 1.14 lukem _DIAGASSERT(p != NULL);
1187 1.14 lukem _DIAGASSERT(cs != NULL);
1188 1.14 lukem
1189 1.1 jtc /* classify what we've got */
1190 1.1 jtc switch ((MORE()) ? PEEK() : '\0') {
1191 1.1 jtc case '[':
1192 1.1 jtc c = (MORE2()) ? PEEK2() : '\0';
1193 1.1 jtc break;
1194 1.1 jtc case '-':
1195 1.1 jtc SETERROR(REG_ERANGE);
1196 1.1 jtc return; /* NOTE RETURN */
1197 1.1 jtc default:
1198 1.1 jtc c = '\0';
1199 1.1 jtc break;
1200 1.1 jtc }
1201 1.1 jtc
1202 1.1 jtc switch (c) {
1203 1.1 jtc case ':': /* character class */
1204 1.1 jtc NEXT2();
1205 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1206 1.1 jtc c = PEEK();
1207 1.39 christos (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
1208 1.1 jtc p_b_cclass(p, cs);
1209 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1210 1.39 christos (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
1211 1.1 jtc break;
1212 1.1 jtc case '=': /* equivalence class */
1213 1.1 jtc NEXT2();
1214 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1215 1.1 jtc c = PEEK();
1216 1.39 christos (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
1217 1.1 jtc p_b_eclass(p, cs);
1218 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1219 1.39 christos (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
1220 1.1 jtc break;
1221 1.1 jtc default: /* symbol, ordinary character, or range */
1222 1.1 jtc start = p_b_symbol(p);
1223 1.1 jtc if (SEE('-') && MORE2() && PEEK2() != ']') {
1224 1.1 jtc /* range */
1225 1.1 jtc NEXT();
1226 1.1 jtc if (EAT('-'))
1227 1.1 jtc finish = '-';
1228 1.1 jtc else
1229 1.1 jtc finish = p_b_symbol(p);
1230 1.1 jtc } else
1231 1.1 jtc finish = start;
1232 1.39 christos if (start == finish)
1233 1.39 christos CHadd(p, cs, start);
1234 1.39 christos else {
1235 1.39 christos #ifdef REGEX_LIBC_COLLATE
1236 1.39 christos if (table->__collate_load_error || MB_CUR_MAX > 1) {
1237 1.39 christos #else
1238 1.39 christos if (MB_CUR_MAX > 1) {
1239 1.39 christos #endif
1240 1.39 christos (void)REQUIRE(start <= finish, REG_ERANGE);
1241 1.39 christos CHaddrange(p, cs, start, finish);
1242 1.39 christos } else {
1243 1.39 christos (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE);
1244 1.39 christos for (i = 0; i <= UCHAR_MAX; i++) {
1245 1.39 christos if (p_range_cmp(start, i) <= 0 &&
1246 1.39 christos p_range_cmp(i, finish) <= 0 )
1247 1.39 christos CHadd(p, cs, i);
1248 1.39 christos }
1249 1.39 christos }
1250 1.39 christos }
1251 1.39 christos break;
1252 1.39 christos }
1253 1.39 christos }
1254 1.39 christos
1255 1.39 christos #ifdef REGEX_GNU_EXTENSIONS
1256 1.39 christos /*
1257 1.39 christos - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S)
1258 1.39 christos == static int p_b_pseudoclass(struct parse *p, char c)
1259 1.39 christos */
1260 1.39 christos static int
1261 1.39 christos p_b_pseudoclass(struct parse *p, char c) {
1262 1.39 christos cset *cs;
1263 1.39 christos
1264 1.39 christos if ((cs = allocset(p)) == NULL)
1265 1.39 christos return(0);
1266 1.39 christos
1267 1.39 christos if (p->g->cflags®_ICASE)
1268 1.39 christos cs->icase = 1;
1269 1.39 christos
1270 1.39 christos switch (c) {
1271 1.39 christos case 'W':
1272 1.39 christos cs->invert = 1;
1273 1.39 christos /* FALLTHROUGH */
1274 1.39 christos case 'w':
1275 1.39 christos p_b_cclass_named(p, cs, "alnum");
1276 1.1 jtc break;
1277 1.39 christos case 'S':
1278 1.39 christos cs->invert = 1;
1279 1.39 christos /* FALLTHROUGH */
1280 1.39 christos case 's':
1281 1.39 christos p_b_cclass_named(p, cs, "space");
1282 1.39 christos break;
1283 1.39 christos default:
1284 1.39 christos return(0);
1285 1.1 jtc }
1286 1.39 christos
1287 1.40 christos EMIT(OANYOF, (size_t)(cs - p->g->sets));
1288 1.39 christos return(1);
1289 1.1 jtc }
1290 1.39 christos #endif
1291 1.1 jtc
1292 1.1 jtc /*
1293 1.1 jtc - p_b_cclass - parse a character-class name and deal with it
1294 1.9 perry == static void p_b_cclass(struct parse *p, cset *cs);
1295 1.1 jtc */
1296 1.1 jtc static void
1297 1.39 christos p_b_cclass(struct parse *p, cset *cs)
1298 1.1 jtc {
1299 1.39 christos const char *sp = p->next;
1300 1.9 perry size_t len;
1301 1.39 christos char clname[16];
1302 1.1 jtc
1303 1.39 christos while (MORE() && isalpha((uch)PEEK()))
1304 1.1 jtc NEXT();
1305 1.1 jtc len = p->next - sp;
1306 1.39 christos if (len >= sizeof(clname) - 1) {
1307 1.1 jtc SETERROR(REG_ECTYPE);
1308 1.1 jtc return;
1309 1.1 jtc }
1310 1.39 christos memcpy(clname, sp, len);
1311 1.39 christos clname[len] = '\0';
1312 1.39 christos
1313 1.39 christos p_b_cclass_named(p, cs, clname);
1314 1.39 christos }
1315 1.42 christos
1316 1.39 christos /*
1317 1.39 christos - p_b_cclass_named - deal with a named character class
1318 1.39 christos == static void p_b_cclass_named(struct parse *p, cset *cs, const char []);
1319 1.39 christos */
1320 1.39 christos static void
1321 1.39 christos p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) {
1322 1.39 christos wctype_t wct;
1323 1.1 jtc
1324 1.39 christos if ((wct = wctype(clname)) == 0) {
1325 1.39 christos SETERROR(REG_ECTYPE);
1326 1.39 christos return;
1327 1.39 christos }
1328 1.39 christos CHaddtype(p, cs, wct);
1329 1.1 jtc }
1330 1.1 jtc
1331 1.1 jtc /*
1332 1.1 jtc - p_b_eclass - parse an equivalence-class name and deal with it
1333 1.9 perry == static void p_b_eclass(struct parse *p, cset *cs);
1334 1.1 jtc *
1335 1.1 jtc * This implementation is incomplete. xxx
1336 1.1 jtc */
1337 1.1 jtc static void
1338 1.39 christos p_b_eclass(struct parse *p, cset *cs)
1339 1.1 jtc {
1340 1.39 christos wint_t c;
1341 1.1 jtc
1342 1.14 lukem _DIAGASSERT(p != NULL);
1343 1.14 lukem _DIAGASSERT(cs != NULL);
1344 1.14 lukem
1345 1.1 jtc c = p_b_coll_elem(p, '=');
1346 1.39 christos CHadd(p, cs, c);
1347 1.1 jtc }
1348 1.1 jtc
1349 1.1 jtc /*
1350 1.1 jtc - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1351 1.39 christos == static wint_t p_b_symbol(struct parse *p);
1352 1.1 jtc */
1353 1.39 christos static wint_t /* value of symbol */
1354 1.39 christos p_b_symbol(struct parse *p)
1355 1.1 jtc {
1356 1.39 christos wint_t value;
1357 1.1 jtc
1358 1.14 lukem _DIAGASSERT(p != NULL);
1359 1.14 lukem
1360 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK);
1361 1.1 jtc if (!EATTWO('[', '.'))
1362 1.39 christos return(WGETNEXT());
1363 1.1 jtc
1364 1.1 jtc /* collating symbol */
1365 1.1 jtc value = p_b_coll_elem(p, '.');
1366 1.39 christos (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1367 1.1 jtc return(value);
1368 1.1 jtc }
1369 1.1 jtc
1370 1.1 jtc /*
1371 1.1 jtc - p_b_coll_elem - parse a collating-element name and look it up
1372 1.39 christos == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
1373 1.1 jtc */
1374 1.39 christos static wint_t /* value of collating element */
1375 1.39 christos p_b_coll_elem(struct parse *p,
1376 1.39 christos wint_t endc) /* name ended by endc,']' */
1377 1.39 christos {
1378 1.39 christos const char *sp = p->next;
1379 1.39 christos struct cname *cp;
1380 1.42 christos size_t len;
1381 1.1 jtc
1382 1.14 lukem _DIAGASSERT(p != NULL);
1383 1.14 lukem
1384 1.1 jtc while (MORE() && !SEETWO(endc, ']'))
1385 1.1 jtc NEXT();
1386 1.1 jtc if (!MORE()) {
1387 1.1 jtc SETERROR(REG_EBRACK);
1388 1.1 jtc return(0);
1389 1.1 jtc }
1390 1.1 jtc len = p->next - sp;
1391 1.1 jtc for (cp = cnames; cp->name != NULL; cp++)
1392 1.37 christos if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1393 1.1 jtc return(cp->code); /* known name */
1394 1.42 christos #ifdef NLS
1395 1.42 christos mbstate_t mbs;
1396 1.42 christos wchar_t wc;
1397 1.42 christos size_t clen;
1398 1.42 christos
1399 1.39 christos memset(&mbs, 0, sizeof(mbs));
1400 1.39 christos if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1401 1.39 christos return (wc); /* single character */
1402 1.39 christos else if (clen == (size_t)-1 || clen == (size_t)-2)
1403 1.39 christos SETERROR(REG_ILLSEQ);
1404 1.39 christos else
1405 1.39 christos SETERROR(REG_ECOLLATE); /* neither */
1406 1.1 jtc return(0);
1407 1.42 christos #else
1408 1.42 christos if (len == 1)
1409 1.42 christos return *sp; /* single character */
1410 1.42 christos SETERROR(REG_ECOLLATE); /* neither */
1411 1.42 christos return 0;
1412 1.42 christos #endif
1413 1.1 jtc }
1414 1.1 jtc
1415 1.1 jtc /*
1416 1.39 christos - may_escape - determine whether 'ch' is escape-able in the current context
1417 1.39 christos == static int may_escape(struct parse *p, const wint_t ch)
1418 1.39 christos */
1419 1.39 christos static bool
1420 1.39 christos may_escape(struct parse *p, const wint_t ch)
1421 1.39 christos {
1422 1.39 christos
1423 1.39 christos if ((p->pflags & PFLAG_LEGACY_ESC) != 0)
1424 1.39 christos return (true);
1425 1.48 christos if (iswalpha(ch) || ch == '\'' || ch == '`')
1426 1.39 christos return (false);
1427 1.39 christos return (true);
1428 1.39 christos #ifdef NOTYET
1429 1.39 christos /*
1430 1.39 christos * Build a whitelist of characters that may be escaped to produce an
1431 1.39 christos * ordinary in the current context. This assumes that these have not
1432 1.39 christos * been otherwise interpreted as a special character. Escaping an
1433 1.39 christos * ordinary character yields undefined results according to
1434 1.39 christos * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
1435 1.39 christos * advantage of this and use escaped ordinary characters to provide
1436 1.39 christos * special meaning, e.g. \b, \B, \w, \W, \s, \S.
1437 1.39 christos */
1438 1.39 christos switch(ch) {
1439 1.39 christos case '|':
1440 1.39 christos case '+':
1441 1.39 christos case '?':
1442 1.39 christos /* The above characters may not be escaped in BREs */
1443 1.39 christos if (!(p->g->cflags®_EXTENDED))
1444 1.39 christos return (false);
1445 1.39 christos /* Fallthrough */
1446 1.39 christos case '(':
1447 1.39 christos case ')':
1448 1.39 christos case '{':
1449 1.39 christos case '}':
1450 1.39 christos case '.':
1451 1.39 christos case '[':
1452 1.39 christos case ']':
1453 1.39 christos case '\\':
1454 1.39 christos case '*':
1455 1.39 christos case '^':
1456 1.39 christos case '$':
1457 1.39 christos return (true);
1458 1.39 christos default:
1459 1.39 christos return (false);
1460 1.39 christos }
1461 1.39 christos #endif
1462 1.39 christos }
1463 1.39 christos
1464 1.39 christos /*
1465 1.1 jtc - othercase - return the case counterpart of an alphabetic
1466 1.39 christos == static wint_t othercase(wint_t ch);
1467 1.1 jtc */
1468 1.39 christos static wint_t /* if no counterpart, return ch */
1469 1.39 christos othercase(wint_t ch)
1470 1.39 christos {
1471 1.39 christos assert(iswalpha(ch));
1472 1.39 christos if (iswupper(ch))
1473 1.39 christos return(towlower(ch));
1474 1.39 christos else if (iswlower(ch))
1475 1.39 christos return(towupper(ch));
1476 1.1 jtc else /* peculiar, but could happen */
1477 1.1 jtc return(ch);
1478 1.1 jtc }
1479 1.1 jtc
1480 1.1 jtc /*
1481 1.1 jtc - bothcases - emit a dualcase version of a two-case character
1482 1.39 christos == static void bothcases(struct parse *p, wint_t ch);
1483 1.1 jtc *
1484 1.1 jtc * Boy, is this implementation ever a kludge...
1485 1.1 jtc */
1486 1.1 jtc static void
1487 1.39 christos bothcases(struct parse *p, wint_t ch)
1488 1.1 jtc {
1489 1.39 christos const char *oldnext = p->next;
1490 1.39 christos const char *oldend = p->end;
1491 1.39 christos char bracket[3 + MB_LEN_MAX];
1492 1.39 christos size_t n;
1493 1.1 jtc
1494 1.14 lukem _DIAGASSERT(p != NULL);
1495 1.14 lukem
1496 1.1 jtc assert(othercase(ch) != ch); /* p_bracket() would recurse */
1497 1.1 jtc p->next = bracket;
1498 1.42 christos #ifdef NLS
1499 1.42 christos mbstate_t mbs;
1500 1.39 christos memset(&mbs, 0, sizeof(mbs));
1501 1.39 christos n = wcrtomb(bracket, ch, &mbs);
1502 1.39 christos assert(n != (size_t)-1);
1503 1.42 christos #else
1504 1.42 christos n = 0;
1505 1.42 christos bracket[n++] = ch;
1506 1.42 christos #endif
1507 1.39 christos bracket[n] = ']';
1508 1.39 christos bracket[n + 1] = '\0';
1509 1.39 christos p->end = bracket+n+1;
1510 1.1 jtc p_bracket(p);
1511 1.39 christos assert(p->next == p->end);
1512 1.1 jtc p->next = oldnext;
1513 1.1 jtc p->end = oldend;
1514 1.1 jtc }
1515 1.1 jtc
1516 1.1 jtc /*
1517 1.1 jtc - ordinary - emit an ordinary character
1518 1.39 christos == static void ordinary(struct parse *p, wint_t ch);
1519 1.1 jtc */
1520 1.1 jtc static void
1521 1.39 christos ordinary(struct parse *p, wint_t ch)
1522 1.1 jtc {
1523 1.39 christos cset *cs;
1524 1.1 jtc
1525 1.14 lukem _DIAGASSERT(p != NULL);
1526 1.14 lukem
1527 1.39 christos if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1528 1.39 christos bothcases(p, ch);
1529 1.40 christos else if ((wint_t)(ch & OPDMASK) == ch)
1530 1.40 christos EMIT(OCHAR, (size_t)ch);
1531 1.1 jtc else {
1532 1.39 christos /*
1533 1.39 christos * Kludge: character is too big to fit into an OCHAR operand.
1534 1.39 christos * Emit a singleton set.
1535 1.39 christos */
1536 1.39 christos if ((cs = allocset(p)) == NULL)
1537 1.39 christos return;
1538 1.39 christos CHadd(p, cs, ch);
1539 1.40 christos EMIT(OANYOF, (size_t)(cs - p->g->sets));
1540 1.1 jtc }
1541 1.1 jtc }
1542 1.1 jtc
1543 1.1 jtc /*
1544 1.1 jtc - nonnewline - emit REG_NEWLINE version of OANY
1545 1.9 perry == static void nonnewline(struct parse *p);
1546 1.1 jtc *
1547 1.1 jtc * Boy, is this implementation ever a kludge...
1548 1.1 jtc */
1549 1.1 jtc static void
1550 1.39 christos nonnewline(struct parse *p)
1551 1.1 jtc {
1552 1.39 christos const char *oldnext = p->next;
1553 1.39 christos const char *oldend = p->end;
1554 1.1 jtc char bracket[4];
1555 1.1 jtc
1556 1.14 lukem _DIAGASSERT(p != NULL);
1557 1.14 lukem
1558 1.1 jtc p->next = bracket;
1559 1.1 jtc p->end = bracket+3;
1560 1.1 jtc bracket[0] = '^';
1561 1.1 jtc bracket[1] = '\n';
1562 1.1 jtc bracket[2] = ']';
1563 1.1 jtc bracket[3] = '\0';
1564 1.1 jtc p_bracket(p);
1565 1.1 jtc assert(p->next == bracket+3);
1566 1.1 jtc p->next = oldnext;
1567 1.1 jtc p->end = oldend;
1568 1.1 jtc }
1569 1.1 jtc
1570 1.1 jtc /*
1571 1.1 jtc - repeat - generate code for a bounded repetition, recursively if needed
1572 1.39 christos == static void repeat(struct parse *p, sopno start, int from, int to);
1573 1.1 jtc */
1574 1.1 jtc static void
1575 1.39 christos repeat(struct parse *p,
1576 1.39 christos sopno start, /* operand from here to end of strip */
1577 1.39 christos int from, /* repeated from this number */
1578 1.39 christos int to) /* to this number of times (maybe INFINITY) */
1579 1.1 jtc {
1580 1.39 christos sopno finish = HERE();
1581 1.1 jtc # define N 2
1582 1.1 jtc # define INF 3
1583 1.1 jtc # define REP(f, t) ((f)*8 + (t))
1584 1.1 jtc # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1585 1.9 perry sopno copy;
1586 1.1 jtc
1587 1.14 lukem _DIAGASSERT(p != NULL);
1588 1.14 lukem
1589 1.39 christos if (p->error != 0) /* head off possible runaway recursion */
1590 1.30 christos return;
1591 1.30 christos
1592 1.1 jtc assert(from <= to);
1593 1.1 jtc
1594 1.1 jtc switch (REP(MAP(from), MAP(to))) {
1595 1.1 jtc case REP(0, 0): /* must be user doing this */
1596 1.1 jtc DROP(finish-start); /* drop the operand */
1597 1.1 jtc break;
1598 1.1 jtc case REP(0, 1): /* as x{1,1}? */
1599 1.1 jtc case REP(0, N): /* as x{1,n}? */
1600 1.1 jtc case REP(0, INF): /* as x{1,}? */
1601 1.4 jtc /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1602 1.4 jtc INSERT(OCH_, start); /* offset is wrong... */
1603 1.39 christos repeat(p, start+1, 1, to);
1604 1.4 jtc ASTERN(OOR1, start);
1605 1.1 jtc AHEAD(start); /* ... fix it */
1606 1.4 jtc EMIT(OOR2, 0);
1607 1.4 jtc AHEAD(THERE());
1608 1.4 jtc ASTERN(O_CH, THERETHERE());
1609 1.1 jtc break;
1610 1.1 jtc case REP(1, 1): /* trivial case */
1611 1.1 jtc /* done */
1612 1.1 jtc break;
1613 1.1 jtc case REP(1, N): /* as x?x{1,n-1} */
1614 1.4 jtc /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1615 1.4 jtc INSERT(OCH_, start);
1616 1.4 jtc ASTERN(OOR1, start);
1617 1.4 jtc AHEAD(start);
1618 1.4 jtc EMIT(OOR2, 0); /* offset very wrong... */
1619 1.4 jtc AHEAD(THERE()); /* ...so fix it */
1620 1.4 jtc ASTERN(O_CH, THERETHERE());
1621 1.1 jtc copy = dupl(p, start+1, finish+1);
1622 1.4 jtc assert(copy == finish+4);
1623 1.39 christos repeat(p, copy, 1, to-1);
1624 1.1 jtc break;
1625 1.1 jtc case REP(1, INF): /* as x+ */
1626 1.1 jtc INSERT(OPLUS_, start);
1627 1.1 jtc ASTERN(O_PLUS, start);
1628 1.1 jtc break;
1629 1.1 jtc case REP(N, N): /* as xx{m-1,n-1} */
1630 1.1 jtc copy = dupl(p, start, finish);
1631 1.39 christos repeat(p, copy, from-1, to-1);
1632 1.1 jtc break;
1633 1.1 jtc case REP(N, INF): /* as xx{n-1,INF} */
1634 1.1 jtc copy = dupl(p, start, finish);
1635 1.39 christos repeat(p, copy, from-1, to);
1636 1.1 jtc break;
1637 1.1 jtc default: /* "can't happen" */
1638 1.1 jtc SETERROR(REG_ASSERT); /* just in case */
1639 1.1 jtc break;
1640 1.1 jtc }
1641 1.1 jtc }
1642 1.1 jtc
1643 1.1 jtc /*
1644 1.39 christos - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1645 1.39 christos - character from the parse struct, signals a REG_ILLSEQ error if the
1646 1.39 christos - character can't be converted. Returns the number of bytes consumed.
1647 1.39 christos */
1648 1.39 christos static wint_t
1649 1.39 christos wgetnext(struct parse *p)
1650 1.39 christos {
1651 1.42 christos #ifdef NLS
1652 1.39 christos mbstate_t mbs;
1653 1.39 christos wchar_t wc;
1654 1.39 christos size_t n;
1655 1.39 christos
1656 1.39 christos memset(&mbs, 0, sizeof(mbs));
1657 1.40 christos n = mbrtowc(&wc, p->next, (size_t)(p->end - p->next), &mbs);
1658 1.39 christos if (n == (size_t)-1 || n == (size_t)-2) {
1659 1.39 christos SETERROR(REG_ILLSEQ);
1660 1.39 christos return (0);
1661 1.39 christos }
1662 1.39 christos if (n == 0)
1663 1.39 christos n = 1;
1664 1.39 christos p->next += n;
1665 1.42 christos return wc;
1666 1.42 christos #else
1667 1.42 christos return *p->next++;
1668 1.42 christos #endif
1669 1.39 christos }
1670 1.39 christos
1671 1.39 christos /*
1672 1.1 jtc - seterr - set an error condition
1673 1.9 perry == static int seterr(struct parse *p, int e);
1674 1.1 jtc */
1675 1.1 jtc static int /* useless but makes type checking happy */
1676 1.39 christos seterr(struct parse *p, int e)
1677 1.1 jtc {
1678 1.14 lukem
1679 1.14 lukem _DIAGASSERT(p != NULL);
1680 1.14 lukem
1681 1.1 jtc if (p->error == 0) /* keep earliest error condition */
1682 1.1 jtc p->error = e;
1683 1.1 jtc p->next = nuls; /* try to bring things to a halt */
1684 1.1 jtc p->end = nuls;
1685 1.1 jtc return(0); /* make the return value well-defined */
1686 1.1 jtc }
1687 1.1 jtc
1688 1.1 jtc /*
1689 1.1 jtc - allocset - allocate a set of characters for []
1690 1.9 perry == static cset *allocset(struct parse *p);
1691 1.1 jtc */
1692 1.1 jtc static cset *
1693 1.39 christos allocset(struct parse *p)
1694 1.1 jtc {
1695 1.39 christos cset *cs, *ncs;
1696 1.1 jtc
1697 1.14 lukem _DIAGASSERT(p != NULL);
1698 1.14 lukem
1699 1.39 christos ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs));
1700 1.39 christos if (ncs == NULL) {
1701 1.39 christos SETERROR(REG_ESPACE);
1702 1.39 christos return (NULL);
1703 1.1 jtc }
1704 1.39 christos p->g->sets = ncs;
1705 1.39 christos cs = &p->g->sets[p->g->ncsets++];
1706 1.39 christos memset(cs, 0, sizeof(*cs));
1707 1.1 jtc
1708 1.1 jtc return(cs);
1709 1.1 jtc }
1710 1.1 jtc
1711 1.1 jtc /*
1712 1.1 jtc - freeset - free a now-unused set
1713 1.9 perry == static void freeset(struct parse *p, cset *cs);
1714 1.1 jtc */
1715 1.1 jtc static void
1716 1.39 christos freeset(struct parse *p, cset *cs)
1717 1.1 jtc {
1718 1.14 lukem cset *top;
1719 1.14 lukem
1720 1.14 lukem _DIAGASSERT(p != NULL);
1721 1.14 lukem _DIAGASSERT(cs != NULL);
1722 1.14 lukem
1723 1.14 lukem top = &p->g->sets[p->g->ncsets];
1724 1.1 jtc
1725 1.39 christos free(cs->wides);
1726 1.39 christos free(cs->ranges);
1727 1.39 christos free(cs->types);
1728 1.39 christos memset(cs, 0, sizeof(*cs));
1729 1.1 jtc if (cs == top-1) /* recover only the easy case */
1730 1.1 jtc p->g->ncsets--;
1731 1.1 jtc }
1732 1.1 jtc
1733 1.1 jtc /*
1734 1.39 christos - singleton - Determine whether a set contains only one character,
1735 1.39 christos - returning it if so, otherwise returning OUT.
1736 1.1 jtc */
1737 1.39 christos static wint_t
1738 1.39 christos singleton(cset *cs)
1739 1.1 jtc {
1740 1.39 christos wint_t i, s, n;
1741 1.1 jtc
1742 1.39 christos for (i = n = 0; i < NC; i++)
1743 1.39 christos if (CHIN(cs, i)) {
1744 1.1 jtc n++;
1745 1.39 christos s = i;
1746 1.39 christos }
1747 1.39 christos if (n == 1)
1748 1.39 christos return (s);
1749 1.39 christos if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1750 1.39 christos cs->icase == 0)
1751 1.39 christos return (cs->wides[0]);
1752 1.39 christos /* Don't bother handling the other cases. */
1753 1.39 christos return (OUT);
1754 1.1 jtc }
1755 1.1 jtc
1756 1.1 jtc /*
1757 1.39 christos - CHadd - add character to character set.
1758 1.1 jtc */
1759 1.1 jtc static void
1760 1.39 christos CHadd(struct parse *p, cset *cs, wint_t ch)
1761 1.1 jtc {
1762 1.39 christos wint_t nch, *newwides;
1763 1.14 lukem
1764 1.14 lukem _DIAGASSERT(p != NULL);
1765 1.14 lukem _DIAGASSERT(cs != NULL);
1766 1.14 lukem
1767 1.39 christos assert(ch >= 0);
1768 1.39 christos if (ch < NC)
1769 1.40 christos cs->bmp[(unsigned)ch >> 3] |= 1 << (ch & 7);
1770 1.39 christos else {
1771 1.39 christos newwides = reallocarray(cs->wides, cs->nwides + 1,
1772 1.39 christos sizeof(*cs->wides));
1773 1.39 christos if (newwides == NULL) {
1774 1.39 christos SETERROR(REG_ESPACE);
1775 1.39 christos return;
1776 1.39 christos }
1777 1.39 christos cs->wides = newwides;
1778 1.39 christos cs->wides[cs->nwides++] = ch;
1779 1.39 christos }
1780 1.39 christos if (cs->icase) {
1781 1.39 christos if ((nch = towlower(ch)) < NC)
1782 1.40 christos cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1783 1.39 christos if ((nch = towupper(ch)) < NC)
1784 1.40 christos cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1785 1.1 jtc }
1786 1.1 jtc }
1787 1.1 jtc
1788 1.1 jtc /*
1789 1.39 christos - CHaddrange - add all characters in the range [min,max] to a character set.
1790 1.1 jtc */
1791 1.1 jtc static void
1792 1.39 christos CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1793 1.1 jtc {
1794 1.39 christos crange *newranges;
1795 1.14 lukem
1796 1.39 christos _DIAGASSERT(p != NULL);
1797 1.14 lukem _DIAGASSERT(cs != NULL);
1798 1.14 lukem
1799 1.39 christos for (; min < NC && min <= max; min++)
1800 1.39 christos CHadd(p, cs, min);
1801 1.39 christos if (min >= max)
1802 1.39 christos return;
1803 1.39 christos newranges = reallocarray(cs->ranges, cs->nranges + 1,
1804 1.39 christos sizeof(*cs->ranges));
1805 1.39 christos if (newranges == NULL) {
1806 1.39 christos SETERROR(REG_ESPACE);
1807 1.1 jtc return;
1808 1.1 jtc }
1809 1.39 christos cs->ranges = newranges;
1810 1.39 christos cs->ranges[cs->nranges].min = min;
1811 1.39 christos cs->ranges[cs->nranges].max = max;
1812 1.39 christos cs->nranges++;
1813 1.1 jtc }
1814 1.1 jtc
1815 1.1 jtc /*
1816 1.39 christos - CHaddtype - add all characters of a certain type to a character set.
1817 1.1 jtc */
1818 1.1 jtc static void
1819 1.39 christos CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1820 1.1 jtc {
1821 1.39 christos wint_t i;
1822 1.39 christos wctype_t *newtypes;
1823 1.14 lukem
1824 1.14 lukem _DIAGASSERT(p != NULL);
1825 1.14 lukem _DIAGASSERT(cs != NULL);
1826 1.14 lukem
1827 1.39 christos for (i = 0; i < NC; i++)
1828 1.39 christos if (iswctype(i, wct))
1829 1.39 christos CHadd(p, cs, i);
1830 1.39 christos newtypes = reallocarray(cs->types, cs->ntypes + 1,
1831 1.39 christos sizeof(*cs->types));
1832 1.39 christos if (newtypes == NULL) {
1833 1.39 christos SETERROR(REG_ESPACE);
1834 1.1 jtc return;
1835 1.39 christos }
1836 1.39 christos cs->types = newtypes;
1837 1.39 christos cs->types[cs->ntypes++] = wct;
1838 1.1 jtc }
1839 1.1 jtc
1840 1.1 jtc /*
1841 1.1 jtc - dupl - emit a duplicate of a bunch of sops
1842 1.9 perry == static sopno dupl(struct parse *p, sopno start, sopno finish);
1843 1.1 jtc */
1844 1.1 jtc static sopno /* start of duplicate */
1845 1.39 christos dupl(struct parse *p,
1846 1.39 christos sopno start, /* from here */
1847 1.39 christos sopno finish) /* to this less one */
1848 1.1 jtc {
1849 1.39 christos sopno ret = HERE();
1850 1.9 perry sopno len = finish - start;
1851 1.1 jtc
1852 1.14 lukem _DIAGASSERT(p != NULL);
1853 1.14 lukem
1854 1.1 jtc assert(finish >= start);
1855 1.1 jtc if (len == 0)
1856 1.1 jtc return(ret);
1857 1.39 christos if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1858 1.39 christos return(ret);
1859 1.40 christos (void) memcpy(p->strip + p->slen,
1860 1.40 christos p->strip + start, len * sizeof(*p->strip));
1861 1.1 jtc p->slen += len;
1862 1.1 jtc return(ret);
1863 1.1 jtc }
1864 1.1 jtc
1865 1.1 jtc /*
1866 1.1 jtc - doemit - emit a strip operator
1867 1.9 perry == static void doemit(struct parse *p, sop op, size_t opnd);
1868 1.1 jtc *
1869 1.1 jtc * It might seem better to implement this as a macro with a function as
1870 1.1 jtc * hard-case backup, but it's just too big and messy unless there are
1871 1.1 jtc * some changes to the data structures. Maybe later.
1872 1.1 jtc */
1873 1.1 jtc static void
1874 1.39 christos doemit(struct parse *p, sop op, size_t opnd)
1875 1.1 jtc {
1876 1.1 jtc /* avoid making error situations worse */
1877 1.1 jtc if (p->error != 0)
1878 1.1 jtc return;
1879 1.1 jtc
1880 1.39 christos _DIAGASSERT(p != NULL);
1881 1.39 christos
1882 1.1 jtc /* deal with oversize operands ("can't happen", more or less) */
1883 1.1 jtc assert(opnd < 1<<OPSHIFT);
1884 1.1 jtc
1885 1.1 jtc /* deal with undersized strip */
1886 1.1 jtc if (p->slen >= p->ssize)
1887 1.30 christos if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */
1888 1.30 christos return;
1889 1.1 jtc
1890 1.1 jtc /* finally, it's all reduced to the easy case */
1891 1.40 christos p->strip[p->slen++] = (sopno)SOP(op, opnd);
1892 1.1 jtc }
1893 1.1 jtc
1894 1.1 jtc /*
1895 1.1 jtc - doinsert - insert a sop into the strip
1896 1.9 perry == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1897 1.1 jtc */
1898 1.1 jtc static void
1899 1.39 christos doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1900 1.1 jtc {
1901 1.9 perry sopno sn;
1902 1.9 perry sop s;
1903 1.9 perry int i;
1904 1.1 jtc
1905 1.14 lukem _DIAGASSERT(p != NULL);
1906 1.14 lukem
1907 1.1 jtc /* avoid making error situations worse */
1908 1.1 jtc if (p->error != 0)
1909 1.1 jtc return;
1910 1.1 jtc
1911 1.1 jtc sn = HERE();
1912 1.1 jtc EMIT(op, opnd); /* do checks, ensure space */
1913 1.1 jtc assert(HERE() == sn+1);
1914 1.1 jtc s = p->strip[sn];
1915 1.1 jtc
1916 1.1 jtc /* adjust paren pointers */
1917 1.1 jtc assert(pos > 0);
1918 1.1 jtc for (i = 1; i < NPAREN; i++) {
1919 1.1 jtc if (p->pbegin[i] >= pos) {
1920 1.1 jtc p->pbegin[i]++;
1921 1.1 jtc }
1922 1.1 jtc if (p->pend[i] >= pos) {
1923 1.1 jtc p->pend[i]++;
1924 1.1 jtc }
1925 1.1 jtc }
1926 1.1 jtc
1927 1.40 christos memmove(&p->strip[pos+1], &p->strip[pos],
1928 1.40 christos (HERE()-pos-1)*sizeof(*p->strip));
1929 1.1 jtc p->strip[pos] = s;
1930 1.1 jtc }
1931 1.1 jtc
1932 1.1 jtc /*
1933 1.1 jtc - dofwd - complete a forward reference
1934 1.9 perry == static void dofwd(struct parse *p, sopno pos, sop value);
1935 1.1 jtc */
1936 1.1 jtc static void
1937 1.39 christos dofwd(struct parse *p, sopno pos, sop value)
1938 1.1 jtc {
1939 1.14 lukem
1940 1.14 lukem _DIAGASSERT(p != NULL);
1941 1.14 lukem
1942 1.1 jtc /* avoid making error situations worse */
1943 1.1 jtc if (p->error != 0)
1944 1.1 jtc return;
1945 1.1 jtc
1946 1.1 jtc assert(value < 1<<OPSHIFT);
1947 1.39 christos p->strip[pos] = OP(p->strip[pos]) | value;
1948 1.1 jtc }
1949 1.1 jtc
1950 1.1 jtc /*
1951 1.1 jtc - enlarge - enlarge the strip
1952 1.39 christos == static int enlarge(struct parse *p, sopno size);
1953 1.1 jtc */
1954 1.30 christos static int
1955 1.35 joerg enlarge(struct parse *p, sopno size)
1956 1.1 jtc {
1957 1.39 christos sop *sp;
1958 1.39 christos
1959 1.14 lukem _DIAGASSERT(p != NULL);
1960 1.14 lukem
1961 1.1 jtc if (p->ssize >= size)
1962 1.30 christos return 1;
1963 1.1 jtc
1964 1.40 christos sp = reallocarray(p->strip, size, sizeof(*p->strip));
1965 1.39 christos if (sp == NULL) {
1966 1.1 jtc SETERROR(REG_ESPACE);
1967 1.30 christos return 0;
1968 1.1 jtc }
1969 1.39 christos p->strip = sp;
1970 1.35 joerg p->ssize = size;
1971 1.30 christos return 1;
1972 1.1 jtc }
1973 1.1 jtc
1974 1.1 jtc /*
1975 1.1 jtc - stripsnug - compact the strip
1976 1.9 perry == static void stripsnug(struct parse *p, struct re_guts *g);
1977 1.1 jtc */
1978 1.1 jtc static void
1979 1.39 christos stripsnug(struct parse *p, struct re_guts *g)
1980 1.1 jtc {
1981 1.14 lukem
1982 1.14 lukem _DIAGASSERT(p != NULL);
1983 1.14 lukem _DIAGASSERT(g != NULL);
1984 1.14 lukem
1985 1.1 jtc g->nstates = p->slen;
1986 1.40 christos g->strip = reallocarray(p->strip, p->slen, sizeof(*p->strip));
1987 1.39 christos if (g->strip == NULL) {
1988 1.39 christos SETERROR(REG_ESPACE);
1989 1.39 christos g->strip = p->strip;
1990 1.39 christos }
1991 1.1 jtc }
1992 1.1 jtc
1993 1.1 jtc /*
1994 1.1 jtc - findmust - fill in must and mlen with longest mandatory literal string
1995 1.9 perry == static void findmust(struct parse *p, struct re_guts *g);
1996 1.1 jtc *
1997 1.1 jtc * This algorithm could do fancy things like analyzing the operands of |
1998 1.1 jtc * for common subsequences. Someday. This code is simple and finds most
1999 1.1 jtc * of the interesting cases.
2000 1.1 jtc *
2001 1.1 jtc * Note that must and mlen got initialized during setup.
2002 1.1 jtc */
2003 1.1 jtc static void
2004 1.39 christos findmust(struct parse *p, struct re_guts *g)
2005 1.1 jtc {
2006 1.9 perry sop *scan;
2007 1.7 christos sop *start = NULL;
2008 1.9 perry sop *newstart = NULL;
2009 1.9 perry sopno newlen;
2010 1.9 perry sop s;
2011 1.9 perry char *cp;
2012 1.39 christos int offset;
2013 1.39 christos mbstate_t mbs;
2014 1.1 jtc
2015 1.14 lukem _DIAGASSERT(p != NULL);
2016 1.14 lukem _DIAGASSERT(g != NULL);
2017 1.14 lukem
2018 1.1 jtc /* avoid making error situations worse */
2019 1.1 jtc if (p->error != 0)
2020 1.1 jtc return;
2021 1.1 jtc
2022 1.39 christos #ifdef notyet
2023 1.39 christos /*
2024 1.39 christos * It's not generally safe to do a ``char'' substring search on
2025 1.39 christos * multibyte character strings, but it's safe for at least
2026 1.39 christos * UTF-8 (see RFC 3629).
2027 1.39 christos */
2028 1.39 christos if (MB_CUR_MAX > 1 &&
2029 1.39 christos strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
2030 1.39 christos return;
2031 1.39 christos #endif
2032 1.39 christos
2033 1.1 jtc /* find the longest OCHAR sequence in strip */
2034 1.1 jtc newlen = 0;
2035 1.39 christos offset = 0;
2036 1.39 christos g->moffset = 0;
2037 1.1 jtc scan = g->strip + 1;
2038 1.1 jtc do {
2039 1.1 jtc s = *scan++;
2040 1.1 jtc switch (OP(s)) {
2041 1.1 jtc case OCHAR: /* sequence member */
2042 1.39 christos if (newlen == 0) { /* new sequence */
2043 1.39 christos memset(&mbs, 0, sizeof(mbs));
2044 1.1 jtc newstart = scan - 1;
2045 1.39 christos }
2046 1.42 christos #ifdef NLS
2047 1.42 christos char buf[MB_LEN_MAX];
2048 1.42 christos size_t clen = wcrtomb(buf, (int)OPND(s), &mbs);
2049 1.39 christos if (clen == (size_t)-1)
2050 1.39 christos goto toohard;
2051 1.40 christos newlen += (sopno)clen;
2052 1.42 christos #else
2053 1.42 christos newlen++;
2054 1.42 christos #endif
2055 1.1 jtc break;
2056 1.1 jtc case OPLUS_: /* things that don't break one */
2057 1.1 jtc case OLPAREN:
2058 1.1 jtc case ORPAREN:
2059 1.1 jtc break;
2060 1.1 jtc case OQUEST_: /* things that must be skipped */
2061 1.1 jtc case OCH_:
2062 1.39 christos offset = altoffset(scan, offset);
2063 1.1 jtc scan--;
2064 1.1 jtc do {
2065 1.1 jtc scan += OPND(s);
2066 1.1 jtc s = *scan;
2067 1.1 jtc /* assert() interferes w debug printouts */
2068 1.40 christos if (OP(s) != O_QUEST &&
2069 1.40 christos OP(s) != O_CH && OP(s) != OOR2) {
2070 1.1 jtc g->iflags |= BAD;
2071 1.1 jtc return;
2072 1.1 jtc }
2073 1.40 christos } while (OP(s) != O_QUEST && OP(s) != O_CH);
2074 1.11 christos /* FALLTHROUGH */
2075 1.39 christos case OBOW: /* things that break a sequence */
2076 1.39 christos case OEOW:
2077 1.39 christos case OBOL:
2078 1.39 christos case OEOL:
2079 1.39 christos case OBOS:
2080 1.39 christos case OEOS:
2081 1.39 christos case OWBND:
2082 1.39 christos case ONWBND:
2083 1.39 christos case O_QUEST:
2084 1.39 christos case O_CH:
2085 1.39 christos case OEND:
2086 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */
2087 1.1 jtc start = newstart;
2088 1.1 jtc g->mlen = newlen;
2089 1.39 christos if (offset > -1) {
2090 1.39 christos g->moffset += offset;
2091 1.39 christos offset = newlen;
2092 1.39 christos } else
2093 1.39 christos g->moffset = offset;
2094 1.39 christos } else {
2095 1.39 christos if (offset > -1)
2096 1.39 christos offset += newlen;
2097 1.1 jtc }
2098 1.1 jtc newlen = 0;
2099 1.1 jtc break;
2100 1.39 christos case OANY:
2101 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */
2102 1.39 christos start = newstart;
2103 1.39 christos g->mlen = newlen;
2104 1.39 christos if (offset > -1) {
2105 1.39 christos g->moffset += offset;
2106 1.39 christos offset = newlen;
2107 1.39 christos } else
2108 1.39 christos g->moffset = offset;
2109 1.39 christos } else {
2110 1.39 christos if (offset > -1)
2111 1.39 christos offset += newlen;
2112 1.39 christos }
2113 1.39 christos if (offset > -1)
2114 1.39 christos offset++;
2115 1.39 christos newlen = 0;
2116 1.39 christos break;
2117 1.39 christos case OANYOF: /* may or may not invalidate offset */
2118 1.39 christos /* First, everything as OANY */
2119 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */
2120 1.39 christos start = newstart;
2121 1.39 christos g->mlen = newlen;
2122 1.39 christos if (offset > -1) {
2123 1.39 christos g->moffset += offset;
2124 1.39 christos offset = newlen;
2125 1.39 christos } else
2126 1.39 christos g->moffset = offset;
2127 1.39 christos } else {
2128 1.39 christos if (offset > -1)
2129 1.39 christos offset += newlen;
2130 1.39 christos }
2131 1.39 christos if (offset > -1)
2132 1.39 christos offset++;
2133 1.39 christos newlen = 0;
2134 1.39 christos break;
2135 1.42 christos #ifdef NLS
2136 1.40 christos toohard:/*FALLTHROUGH*/
2137 1.42 christos #endif
2138 1.39 christos default:
2139 1.39 christos /* Anything here makes it impossible or too hard
2140 1.39 christos * to calculate the offset -- so we give up;
2141 1.39 christos * save the last known good offset, in case the
2142 1.39 christos * must sequence doesn't occur later.
2143 1.39 christos */
2144 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */
2145 1.39 christos start = newstart;
2146 1.39 christos g->mlen = newlen;
2147 1.39 christos if (offset > -1)
2148 1.39 christos g->moffset += offset;
2149 1.39 christos else
2150 1.39 christos g->moffset = offset;
2151 1.39 christos }
2152 1.39 christos offset = -1;
2153 1.39 christos newlen = 0;
2154 1.39 christos break;
2155 1.1 jtc }
2156 1.1 jtc } while (OP(s) != OEND);
2157 1.1 jtc
2158 1.39 christos if (g->mlen == 0) { /* there isn't one */
2159 1.39 christos g->moffset = -1;
2160 1.1 jtc return;
2161 1.39 christos }
2162 1.1 jtc
2163 1.1 jtc /* turn it into a character string */
2164 1.1 jtc g->must = malloc((size_t)g->mlen + 1);
2165 1.1 jtc if (g->must == NULL) { /* argh; just forget it */
2166 1.1 jtc g->mlen = 0;
2167 1.39 christos g->moffset = -1;
2168 1.1 jtc return;
2169 1.1 jtc }
2170 1.1 jtc cp = g->must;
2171 1.1 jtc scan = start;
2172 1.39 christos memset(&mbs, 0, sizeof(mbs));
2173 1.39 christos while (cp < g->must + g->mlen) {
2174 1.1 jtc while (OP(s = *scan++) != OCHAR)
2175 1.1 jtc continue;
2176 1.42 christos #ifdef NLS
2177 1.43 christos size_t clen = wcrtomb(cp, (int)OPND(s), &mbs);
2178 1.39 christos assert(clen != (size_t)-1);
2179 1.39 christos cp += clen;
2180 1.42 christos #else
2181 1.42 christos *cp++ = OPND(s);
2182 1.42 christos #endif
2183 1.1 jtc }
2184 1.2 jtc assert(cp == g->must + g->mlen);
2185 1.1 jtc *cp++ = '\0'; /* just on general principles */
2186 1.1 jtc }
2187 1.1 jtc
2188 1.1 jtc /*
2189 1.39 christos - altoffset - choose biggest offset among multiple choices
2190 1.39 christos == static int altoffset(sop *scan, int offset);
2191 1.39 christos *
2192 1.39 christos * Compute, recursively if necessary, the largest offset among multiple
2193 1.39 christos * re paths.
2194 1.39 christos */
2195 1.39 christos static int
2196 1.39 christos altoffset(sop *scan, int offset)
2197 1.39 christos {
2198 1.39 christos int largest;
2199 1.39 christos int try;
2200 1.39 christos sop s;
2201 1.39 christos
2202 1.39 christos _DIAGASSERT(scan != NULL);
2203 1.39 christos
2204 1.39 christos /* If we gave up already on offsets, return */
2205 1.39 christos if (offset == -1)
2206 1.39 christos return -1;
2207 1.39 christos
2208 1.39 christos largest = 0;
2209 1.39 christos try = 0;
2210 1.39 christos s = *scan++;
2211 1.40 christos while (OP(s) != O_QUEST && OP(s) != O_CH) {
2212 1.39 christos switch (OP(s)) {
2213 1.39 christos case OOR1:
2214 1.39 christos if (try > largest)
2215 1.39 christos largest = try;
2216 1.39 christos try = 0;
2217 1.39 christos break;
2218 1.39 christos case OQUEST_:
2219 1.39 christos case OCH_:
2220 1.39 christos try = altoffset(scan, try);
2221 1.39 christos if (try == -1)
2222 1.39 christos return -1;
2223 1.39 christos scan--;
2224 1.39 christos do {
2225 1.39 christos scan += OPND(s);
2226 1.39 christos s = *scan;
2227 1.40 christos if (OP(s) != O_QUEST &&
2228 1.40 christos OP(s) != O_CH && OP(s) != OOR2)
2229 1.39 christos return -1;
2230 1.40 christos } while (OP(s) != O_QUEST && OP(s) != O_CH);
2231 1.39 christos /* We must skip to the next position, or we'll
2232 1.39 christos * leave altoffset() too early.
2233 1.39 christos */
2234 1.39 christos scan++;
2235 1.39 christos break;
2236 1.39 christos case OANYOF:
2237 1.39 christos case OCHAR:
2238 1.39 christos case OANY:
2239 1.39 christos try++;
2240 1.40 christos /*FALLTHROUGH*/
2241 1.39 christos case OBOW:
2242 1.39 christos case OEOW:
2243 1.39 christos case OWBND:
2244 1.39 christos case ONWBND:
2245 1.39 christos case OLPAREN:
2246 1.39 christos case ORPAREN:
2247 1.39 christos case OOR2:
2248 1.39 christos break;
2249 1.39 christos default:
2250 1.39 christos try = -1;
2251 1.39 christos break;
2252 1.39 christos }
2253 1.39 christos if (try == -1)
2254 1.39 christos return -1;
2255 1.39 christos s = *scan++;
2256 1.39 christos }
2257 1.39 christos
2258 1.39 christos if (try > largest)
2259 1.39 christos largest = try;
2260 1.39 christos
2261 1.39 christos return largest+offset;
2262 1.39 christos }
2263 1.39 christos
2264 1.39 christos /*
2265 1.39 christos - computejumps - compute char jumps for BM scan
2266 1.39 christos == static void computejumps(struct parse *p, struct re_guts *g);
2267 1.39 christos *
2268 1.39 christos * This algorithm assumes g->must exists and is has size greater than
2269 1.39 christos * zero. It's based on the algorithm found on Computer Algorithms by
2270 1.39 christos * Sara Baase.
2271 1.39 christos *
2272 1.39 christos * A char jump is the number of characters one needs to jump based on
2273 1.39 christos * the value of the character from the text that was mismatched.
2274 1.39 christos */
2275 1.39 christos static void
2276 1.39 christos computejumps(struct parse *p, struct re_guts *g)
2277 1.39 christos {
2278 1.39 christos int ch;
2279 1.39 christos size_t mindex;
2280 1.39 christos
2281 1.39 christos _DIAGASSERT(p != NULL);
2282 1.39 christos _DIAGASSERT(g != NULL);
2283 1.39 christos
2284 1.39 christos /* Avoid making errors worse */
2285 1.39 christos if (p->error != 0)
2286 1.39 christos return;
2287 1.39 christos
2288 1.39 christos g->charjump = calloc((NC_MAX + 1), sizeof(*g->charjump));
2289 1.39 christos if (g->charjump == NULL) /* Not a fatal error */
2290 1.39 christos return;
2291 1.39 christos /* Adjust for signed chars, if necessary */
2292 1.39 christos g->charjump = &g->charjump[-(CHAR_MIN)];
2293 1.39 christos
2294 1.39 christos /* If the character does not exist in the pattern, the jump
2295 1.39 christos * is equal to the number of characters in the pattern.
2296 1.39 christos */
2297 1.39 christos for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
2298 1.39 christos g->charjump[ch] = g->mlen;
2299 1.39 christos
2300 1.39 christos /* If the character does exist, compute the jump that would
2301 1.39 christos * take us to the last character in the pattern equal to it
2302 1.39 christos * (notice that we match right to left, so that last character
2303 1.39 christos * is the first one that would be matched).
2304 1.39 christos */
2305 1.39 christos for (mindex = 0; mindex < g->mlen; mindex++)
2306 1.39 christos g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
2307 1.39 christos }
2308 1.39 christos
2309 1.39 christos /*
2310 1.39 christos - computematchjumps - compute match jumps for BM scan
2311 1.39 christos == static void computematchjumps(struct parse *p, struct re_guts *g);
2312 1.39 christos *
2313 1.39 christos * This algorithm assumes g->must exists and is has size greater than
2314 1.39 christos * zero. It's based on the algorithm found on Computer Algorithms by
2315 1.39 christos * Sara Baase.
2316 1.39 christos *
2317 1.39 christos * A match jump is the number of characters one needs to advance based
2318 1.39 christos * on the already-matched suffix.
2319 1.39 christos * Notice that all values here are minus (g->mlen-1), because of the way
2320 1.39 christos * the search algorithm works.
2321 1.39 christos */
2322 1.39 christos static void
2323 1.39 christos computematchjumps(struct parse *p, struct re_guts *g)
2324 1.39 christos {
2325 1.39 christos size_t mindex; /* General "must" iterator */
2326 1.39 christos size_t suffix; /* Keeps track of matching suffix */
2327 1.39 christos size_t ssuffix; /* Keeps track of suffixes' suffix */
2328 1.39 christos size_t* pmatches; /* pmatches[k] points to the next i
2329 1.39 christos * such that i+1...mlen is a substring
2330 1.39 christos * of k+1...k+mlen-i-1
2331 1.39 christos */
2332 1.39 christos
2333 1.39 christos _DIAGASSERT(p != NULL);
2334 1.39 christos _DIAGASSERT(g != NULL);
2335 1.39 christos
2336 1.39 christos /* Avoid making errors worse */
2337 1.39 christos if (p->error != 0)
2338 1.39 christos return;
2339 1.39 christos
2340 1.39 christos pmatches = calloc(g->mlen, sizeof(*pmatches));
2341 1.39 christos if (pmatches == NULL) {
2342 1.39 christos g->matchjump = NULL;
2343 1.39 christos return;
2344 1.39 christos }
2345 1.39 christos
2346 1.39 christos g->matchjump = calloc(g->mlen, sizeof(*g->matchjump));
2347 1.39 christos if (g->matchjump == NULL) { /* Not a fatal error */
2348 1.39 christos free(pmatches);
2349 1.39 christos return;
2350 1.39 christos }
2351 1.39 christos
2352 1.39 christos /* Set maximum possible jump for each character in the pattern */
2353 1.39 christos for (mindex = 0; mindex < g->mlen; mindex++)
2354 1.39 christos g->matchjump[mindex] = 2 * g->mlen - mindex - 1;
2355 1.39 christos
2356 1.39 christos /* Compute pmatches[] */
2357 1.39 christos for (suffix = mindex = g->mlen; mindex-- > 0; suffix--) {
2358 1.39 christos pmatches[mindex] = suffix;
2359 1.39 christos
2360 1.39 christos /* If a mismatch is found, interrupting the substring,
2361 1.39 christos * compute the matchjump for that position. If no
2362 1.39 christos * mismatch is found, then a text substring mismatched
2363 1.39 christos * against the suffix will also mismatch against the
2364 1.39 christos * substring.
2365 1.39 christos */
2366 1.39 christos while (suffix < g->mlen
2367 1.39 christos && g->must[mindex] != g->must[suffix]) {
2368 1.39 christos g->matchjump[suffix] = MIN(g->matchjump[suffix],
2369 1.39 christos g->mlen - mindex - 1);
2370 1.39 christos suffix = pmatches[suffix];
2371 1.39 christos }
2372 1.39 christos }
2373 1.39 christos
2374 1.39 christos /* Compute the matchjump up to the last substring found to jump
2375 1.39 christos * to the beginning of the largest must pattern prefix matching
2376 1.39 christos * it's own suffix.
2377 1.39 christos */
2378 1.39 christos for (mindex = 0; mindex <= suffix; mindex++)
2379 1.39 christos g->matchjump[mindex] = MIN(g->matchjump[mindex],
2380 1.39 christos g->mlen + suffix - mindex);
2381 1.39 christos
2382 1.39 christos ssuffix = pmatches[suffix];
2383 1.39 christos while (suffix < g->mlen) {
2384 1.39 christos while (suffix <= ssuffix && suffix < g->mlen) {
2385 1.39 christos g->matchjump[suffix] = MIN(g->matchjump[suffix],
2386 1.39 christos g->mlen + ssuffix - suffix);
2387 1.39 christos suffix++;
2388 1.39 christos }
2389 1.39 christos if (suffix < g->mlen)
2390 1.39 christos ssuffix = pmatches[ssuffix];
2391 1.39 christos }
2392 1.39 christos
2393 1.39 christos free(pmatches);
2394 1.39 christos }
2395 1.39 christos
2396 1.39 christos /*
2397 1.1 jtc - pluscount - count + nesting
2398 1.9 perry == static sopno pluscount(struct parse *p, struct re_guts *g);
2399 1.1 jtc */
2400 1.1 jtc static sopno /* nesting depth */
2401 1.39 christos pluscount(struct parse *p, struct re_guts *g)
2402 1.1 jtc {
2403 1.9 perry sop *scan;
2404 1.9 perry sop s;
2405 1.9 perry sopno plusnest = 0;
2406 1.9 perry sopno maxnest = 0;
2407 1.14 lukem
2408 1.14 lukem _DIAGASSERT(p != NULL);
2409 1.14 lukem _DIAGASSERT(g != NULL);
2410 1.1 jtc
2411 1.1 jtc if (p->error != 0)
2412 1.1 jtc return(0); /* there may not be an OEND */
2413 1.1 jtc
2414 1.1 jtc scan = g->strip + 1;
2415 1.1 jtc do {
2416 1.1 jtc s = *scan++;
2417 1.1 jtc switch (OP(s)) {
2418 1.1 jtc case OPLUS_:
2419 1.1 jtc plusnest++;
2420 1.1 jtc break;
2421 1.1 jtc case O_PLUS:
2422 1.1 jtc if (plusnest > maxnest)
2423 1.1 jtc maxnest = plusnest;
2424 1.1 jtc plusnest--;
2425 1.1 jtc break;
2426 1.1 jtc }
2427 1.1 jtc } while (OP(s) != OEND);
2428 1.1 jtc if (plusnest != 0)
2429 1.1 jtc g->iflags |= BAD;
2430 1.1 jtc return(maxnest);
2431 1.1 jtc }
2432