Home | History | Annotate | Line # | Download | only in regex
regcomp.c revision 1.42
      1  1.42  christos /*	$NetBSD: regcomp.c,v 1.42 2021/02/25 21:28:40 christos Exp $	*/
      2   1.6       cgd 
      3   1.5       cgd /*-
      4  1.39  christos  * SPDX-License-Identifier: BSD-3-Clause
      5  1.39  christos  *
      6  1.39  christos  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
      7   1.5       cgd  * Copyright (c) 1992, 1993, 1994
      8   1.5       cgd  *	The Regents of the University of California.  All rights reserved.
      9   1.5       cgd  *
     10  1.39  christos  * Copyright (c) 2011 The FreeBSD Foundation
     11  1.39  christos  * All rights reserved.
     12  1.39  christos  * Portions of this software were developed by David Chisnall
     13  1.39  christos  * under sponsorship from the FreeBSD Foundation.
     14  1.39  christos  *
     15   1.5       cgd  * This code is derived from software contributed to Berkeley by
     16   1.5       cgd  * Henry Spencer.
     17   1.5       cgd  *
     18   1.5       cgd  * Redistribution and use in source and binary forms, with or without
     19   1.5       cgd  * modification, are permitted provided that the following conditions
     20   1.5       cgd  * are met:
     21   1.5       cgd  * 1. Redistributions of source code must retain the above copyright
     22   1.5       cgd  *    notice, this list of conditions and the following disclaimer.
     23   1.5       cgd  * 2. Redistributions in binary form must reproduce the above copyright
     24   1.5       cgd  *    notice, this list of conditions and the following disclaimer in the
     25   1.5       cgd  *    documentation and/or other materials provided with the distribution.
     26  1.18       agc  * 3. Neither the name of the University nor the names of its contributors
     27  1.18       agc  *    may be used to endorse or promote products derived from this software
     28  1.18       agc  *    without specific prior written permission.
     29  1.18       agc  *
     30  1.18       agc  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     31  1.18       agc  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     32  1.18       agc  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     33  1.18       agc  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     34  1.18       agc  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     35  1.18       agc  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     36  1.18       agc  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     37  1.18       agc  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     38  1.18       agc  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     39  1.18       agc  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     40  1.18       agc  * SUCH DAMAGE.
     41  1.18       agc  *
     42  1.18       agc  *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
     43  1.18       agc  */
     44  1.18       agc 
     45   1.7  christos #include <sys/cdefs.h>
     46   1.6       cgd #if 0
     47   1.5       cgd static char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
     48  1.39  christos __FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 368359 2020-12-05 03:18:48Z kevans $");
     49   1.6       cgd #endif
     50  1.42  christos __RCSID("$NetBSD: regcomp.c,v 1.42 2021/02/25 21:28:40 christos Exp $");
     51  1.39  christos 
     52  1.39  christos #define _OPENBSD_SOURCE
     53  1.42  christos 
     54  1.42  christos #ifndef LIBHACK
     55  1.39  christos #define REGEX_GNU_EXTENSIONS
     56   1.5       cgd 
     57   1.8       jtc #include "namespace.h"
     58  1.42  christos #endif
     59   1.1       jtc #include <sys/types.h>
     60  1.39  christos #include <stdio.h>
     61  1.39  christos #include <string.h>
     62   1.1       jtc #include <ctype.h>
     63   1.1       jtc #include <limits.h>
     64   1.1       jtc #include <stdlib.h>
     65  1.28  junyoung #include <regex.h>
     66  1.39  christos #include <stdbool.h>
     67   1.8       jtc 
     68  1.42  christos #if defined(_weak_alias) && !defined(LIBHACK)
     69  1.16   mycroft __weak_alias(regcomp,_regcomp)
     70   1.8       jtc #endif
     71   1.1       jtc 
     72  1.39  christos #ifdef REGEX_LIBC_COLLATE
     73  1.39  christos #include "collate.h"
     74  1.39  christos #endif
     75  1.39  christos 
     76   1.1       jtc #include "utils.h"
     77   1.1       jtc #include "regex2.h"
     78   1.1       jtc 
     79   1.1       jtc #include "cname.h"
     80   1.1       jtc 
     81   1.1       jtc /*
     82  1.39  christos  * Branching context, used to keep track of branch state for all of the branch-
     83  1.39  christos  * aware functions. In addition to keeping track of branch positions for the
     84  1.39  christos  * p_branch_* functions, we use this to simplify some clumsiness in BREs for
     85  1.39  christos  * detection of whether ^ is acting as an anchor or being used erroneously and
     86  1.39  christos  * also for whether we're in a sub-expression or not.
     87  1.39  christos  */
     88  1.39  christos struct branchc {
     89  1.39  christos 	sopno start;
     90  1.39  christos 	sopno back;
     91  1.39  christos 	sopno fwd;
     92  1.39  christos 
     93  1.39  christos 	int nbranch;
     94  1.39  christos 	int nchain;
     95  1.39  christos 	bool outer;
     96  1.39  christos 	bool terminate;
     97  1.39  christos };
     98  1.39  christos 
     99  1.39  christos /*
    100   1.1       jtc  * parse structure, passed up and down to avoid global variables and
    101   1.1       jtc  * other clumsinesses
    102   1.1       jtc  */
    103   1.1       jtc struct parse {
    104  1.21      yamt 	const char *next;	/* next character in RE */
    105  1.21      yamt 	const char *end;	/* end of string (-> NUL normally) */
    106   1.1       jtc 	int error;		/* has an error been seen? */
    107  1.39  christos 	int gnuext;
    108   1.1       jtc 	sop *strip;		/* malloced strip */
    109   1.1       jtc 	sopno ssize;		/* malloced strip size (allocated) */
    110   1.1       jtc 	sopno slen;		/* malloced strip length (used) */
    111  1.33  christos 	size_t ncsalloc;	/* number of csets allocated */
    112   1.1       jtc 	struct re_guts *g;
    113   1.1       jtc #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
    114   1.1       jtc 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
    115   1.1       jtc 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
    116  1.39  christos 	bool allowbranch;	/* can this expression branch? */
    117  1.39  christos 	bool bre;		/* convenience; is this a BRE? */
    118  1.39  christos 	int pflags;		/* other parsing flags -- legacy escapes? */
    119  1.39  christos 	bool (*parse_expr)(struct parse *, struct branchc *);
    120  1.39  christos 	void (*pre_parse)(struct parse *, struct branchc *);
    121  1.39  christos 	void (*post_parse)(struct parse *, struct branchc *);
    122   1.1       jtc };
    123   1.1       jtc 
    124  1.39  christos #define PFLAG_LEGACY_ESC	0x00000001
    125  1.39  christos 
    126   1.5       cgd /* ========= begin header generated by ./mkh ========= */
    127   1.5       cgd #ifdef __cplusplus
    128   1.5       cgd extern "C" {
    129   1.5       cgd #endif
    130   1.5       cgd 
    131   1.5       cgd /* === regcomp.c === */
    132  1.39  christos static bool p_ere_exp(struct parse *p, struct branchc *bc);
    133  1.26  junyoung static void p_str(struct parse *p);
    134  1.39  christos static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
    135  1.39  christos static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
    136  1.39  christos static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
    137  1.39  christos static bool p_branch_empty(struct parse *p, struct branchc *bc);
    138  1.39  christos static bool p_branch_do(struct parse *p, struct branchc *bc);
    139  1.39  christos static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
    140  1.39  christos static void p_bre_post_parse(struct parse *p, struct branchc *bc);
    141  1.39  christos static void p_re(struct parse *p, int end1, int end2);
    142  1.39  christos static bool p_simp_re(struct parse *p, struct branchc *bc);
    143  1.26  junyoung static int p_count(struct parse *p);
    144  1.26  junyoung static void p_bracket(struct parse *p);
    145  1.39  christos static int p_range_cmp(wchar_t c1, wchar_t c2);
    146  1.26  junyoung static void p_b_term(struct parse *p, cset *cs);
    147  1.39  christos #ifdef REGEX_GNU_EXTENSIONS
    148  1.39  christos static int p_b_pseudoclass(struct parse *p, char c);
    149  1.39  christos #endif
    150  1.26  junyoung static void p_b_cclass(struct parse *p, cset *cs);
    151  1.39  christos static void p_b_cclass_named(struct parse *p, cset *cs, const char[]);
    152  1.26  junyoung static void p_b_eclass(struct parse *p, cset *cs);
    153  1.39  christos static wint_t p_b_symbol(struct parse *p);
    154  1.39  christos static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
    155  1.39  christos static bool may_escape(struct parse *p, const wint_t ch);
    156  1.39  christos static wint_t othercase(wint_t ch);
    157  1.39  christos static void bothcases(struct parse *p, wint_t ch);
    158  1.39  christos static void ordinary(struct parse *p, wint_t ch);
    159  1.26  junyoung static void nonnewline(struct parse *p);
    160  1.39  christos static void repeat(struct parse *p, sopno start, int from, int to);
    161  1.26  junyoung static int seterr(struct parse *p, int e);
    162  1.26  junyoung static cset *allocset(struct parse *p);
    163  1.26  junyoung static void freeset(struct parse *p, cset *cs);
    164  1.39  christos static void CHadd(struct parse *p, cset *cs, wint_t ch);
    165  1.39  christos static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
    166  1.39  christos static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
    167  1.39  christos static wint_t singleton(cset *cs);
    168  1.26  junyoung static sopno dupl(struct parse *p, sopno start, sopno finish);
    169  1.39  christos static void doemit(struct parse *p, sop op, size_t opnd);
    170  1.39  christos static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
    171  1.39  christos static void dofwd(struct parse *p, sopno pos, sop value);
    172  1.30  christos static int enlarge(struct parse *p, sopno size);
    173  1.26  junyoung static void stripsnug(struct parse *p, struct re_guts *g);
    174  1.26  junyoung static void findmust(struct parse *p, struct re_guts *g);
    175  1.39  christos static int altoffset(sop *scan, int offset);
    176  1.39  christos static void computejumps(struct parse *p, struct re_guts *g);
    177  1.39  christos static void computematchjumps(struct parse *p, struct re_guts *g);
    178  1.26  junyoung static sopno pluscount(struct parse *p, struct re_guts *g);
    179  1.39  christos static wint_t wgetnext(struct parse *p);
    180   1.5       cgd 
    181   1.5       cgd #ifdef __cplusplus
    182   1.5       cgd }
    183   1.5       cgd #endif
    184   1.5       cgd /* ========= end header generated by ./mkh ========= */
    185   1.1       jtc 
    186   1.1       jtc static char nuls[10];		/* place to point scanner in event of error */
    187   1.1       jtc 
    188   1.1       jtc /*
    189   1.1       jtc  * macros for use with parse structure
    190   1.1       jtc  * BEWARE:  these know that the parse structure is named `p' !!!
    191   1.1       jtc  */
    192   1.1       jtc #define	PEEK()	(*p->next)
    193   1.1       jtc #define	PEEK2()	(*(p->next+1))
    194   1.1       jtc #define	MORE()	(p->next < p->end)
    195   1.1       jtc #define	MORE2()	(p->next+1 < p->end)
    196   1.1       jtc #define	SEE(c)	(MORE() && PEEK() == (c))
    197   1.1       jtc #define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
    198  1.39  christos #define	SEESPEC(a)	(p->bre ? SEETWO('\\', a) : SEE(a))
    199   1.1       jtc #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
    200   1.1       jtc #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
    201  1.39  christos #define	EATSPEC(a)	(p->bre ? EATTWO('\\', a) : EAT(a))
    202   1.1       jtc #define	NEXT()	(p->next++)
    203   1.1       jtc #define	NEXT2()	(p->next += 2)
    204   1.1       jtc #define	NEXTn(n)	(p->next += (n))
    205   1.1       jtc #define	GETNEXT()	(*p->next++)
    206  1.39  christos #define	WGETNEXT()	wgetnext(p)
    207   1.1       jtc #define	SETERROR(e)	seterr(p, (e))
    208  1.39  christos #define	REQUIRE(co, e)	((co) || SETERROR(e))
    209   1.1       jtc #define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
    210  1.39  christos #define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
    211   1.1       jtc #define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
    212  1.40  christos #define	EMIT(op, sopnd)	doemit(p, (op), (sopnd))
    213  1.40  christos #define	INSERT(op, pos)	doinsert(p, (op), HERE()-(pos)+1, pos)
    214  1.12  drochner #define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
    215   1.1       jtc #define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
    216   1.1       jtc #define	HERE()		(p->slen)
    217   1.1       jtc #define	THERE()		(p->slen - 1)
    218   1.4       jtc #define	THERETHERE()	(p->slen - 2)
    219   1.1       jtc #define	DROP(n)	(p->slen -= (n))
    220   1.1       jtc 
    221  1.39  christos /* Macro used by computejump()/computematchjump() */
    222  1.41  christos #ifndef MIN
    223  1.39  christos #define MIN(a,b)	((a)<(b)?(a):(b))
    224  1.41  christos #endif
    225  1.30  christos 
    226  1.42  christos #ifndef NLS
    227  1.42  christos static const struct {
    228  1.42  christos 	const char *name;
    229  1.42  christos 	int (*func)(int);
    230  1.42  christos } wctypes[] = {
    231  1.42  christos #define ADD(x) { .name = # x, .func = is ## x }
    232  1.42  christos 	ADD(alnum),
    233  1.42  christos 	ADD(alpha),
    234  1.42  christos 	ADD(blank),
    235  1.42  christos 	ADD(cntrl),
    236  1.42  christos 	ADD(digit),
    237  1.42  christos 	ADD(graph),
    238  1.42  christos 	ADD(lower),
    239  1.42  christos 	ADD(print),
    240  1.42  christos 	ADD(punct),
    241  1.42  christos 	ADD(space),
    242  1.42  christos 	ADD(upper),
    243  1.42  christos 	ADD(xdigit),
    244  1.42  christos #undef ADD
    245  1.42  christos };
    246  1.42  christos 
    247  1.42  christos wctype_t
    248  1.42  christos __regex_wctype(const char *str)
    249  1.42  christos {
    250  1.42  christos 	for (size_t i = 0; i < __arraycount(wctypes); i++) {
    251  1.42  christos 		if (strcmp(wctypes[i].name, str) == 0)
    252  1.42  christos 			return (wctype_t)(i + 1);
    253  1.42  christos 	}
    254  1.42  christos 	return (wctype_t)0;
    255  1.42  christos }
    256  1.42  christos 
    257  1.42  christos int
    258  1.42  christos __regex_iswctype(wint_t c, wctype_t ct)
    259  1.42  christos {
    260  1.42  christos 	if (ct == 0)
    261  1.42  christos 		return 0;
    262  1.42  christos 	return (*wctypes[ct - 1].func)(c);
    263  1.42  christos }
    264  1.42  christos #endif
    265  1.42  christos 
    266  1.39  christos static int				/* 0 success, otherwise REG_something */
    267  1.39  christos regcomp_internal(regex_t * __restrict preg,
    268  1.39  christos 	const char * __restrict pattern,
    269  1.39  christos 	int cflags, int pflags)
    270   1.1       jtc {
    271   1.1       jtc 	struct parse pa;
    272   1.9     perry 	struct re_guts *g;
    273   1.9     perry 	struct parse *p = &pa;
    274   1.9     perry 	int i;
    275   1.9     perry 	size_t len;
    276  1.39  christos 	size_t maxlen;
    277   1.3       jtc #ifdef REDEBUG
    278   1.3       jtc #	define	GOODFLAGS(f)	(f)
    279   1.3       jtc #else
    280   1.3       jtc #	define	GOODFLAGS(f)	((f)&~REG_DUMP)
    281   1.3       jtc #endif
    282   1.1       jtc 
    283  1.14     lukem 	_DIAGASSERT(preg != NULL);
    284  1.14     lukem 	_DIAGASSERT(pattern != NULL);
    285  1.14     lukem 
    286   1.3       jtc 	cflags = GOODFLAGS(cflags);
    287   1.1       jtc 	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
    288   1.1       jtc 		return(REG_INVARG);
    289   1.1       jtc 
    290   1.1       jtc 	if (cflags&REG_PEND) {
    291   1.1       jtc 		if (preg->re_endp < pattern)
    292   1.1       jtc 			return(REG_INVARG);
    293   1.1       jtc 		len = preg->re_endp - pattern;
    294   1.1       jtc 	} else
    295  1.11  christos 		len = strlen(pattern);
    296   1.1       jtc 
    297   1.1       jtc 	/* do the mallocs early so failure handling is easy */
    298  1.39  christos 	g = malloc(sizeof(*g));
    299   1.1       jtc 	if (g == NULL)
    300   1.1       jtc 		return(REG_ESPACE);
    301  1.39  christos 	/*
    302  1.39  christos 	 * Limit the pattern space to avoid a 32-bit overflow on buffer
    303  1.39  christos 	 * extension.  Also avoid any signed overflow in case of conversion
    304  1.39  christos 	 * so make the real limit based on a 31-bit overflow.
    305  1.39  christos 	 *
    306  1.39  christos 	 * Likely not applicable on 64-bit systems but handle the case
    307  1.39  christos 	 * generically (who are we to stop people from using ~715MB+
    308  1.39  christos 	 * patterns?).
    309  1.39  christos 	 */
    310  1.40  christos 	maxlen = ((size_t)-1 >> 1) / sizeof(*p->strip) * 2 / 3;
    311  1.39  christos 	if (len >= maxlen) {
    312  1.40  christos 		free(g);
    313  1.39  christos 		return(REG_ESPACE);
    314  1.39  christos 	}
    315  1.40  christos 	p->ssize = (sopno)(len / 2 * 3 + 1);	/* ugh */
    316  1.39  christos 	assert(p->ssize >= len);
    317  1.39  christos 
    318  1.39  christos 	p->strip = calloc(p->ssize, sizeof(*p->strip));
    319   1.1       jtc 	p->slen = 0;
    320   1.1       jtc 	if (p->strip == NULL) {
    321  1.40  christos 		free(g);
    322   1.1       jtc 		return(REG_ESPACE);
    323   1.1       jtc 	}
    324   1.1       jtc 
    325   1.1       jtc 	/* set things up */
    326   1.1       jtc 	p->g = g;
    327  1.39  christos 	p->next = pattern;	/* convenience; we do not modify it */
    328   1.1       jtc 	p->end = p->next + len;
    329   1.1       jtc 	p->error = 0;
    330   1.1       jtc 	p->ncsalloc = 0;
    331  1.39  christos 	p->pflags = pflags;
    332   1.1       jtc 	for (i = 0; i < NPAREN; i++) {
    333   1.1       jtc 		p->pbegin[i] = 0;
    334   1.1       jtc 		p->pend[i] = 0;
    335   1.1       jtc 	}
    336  1.39  christos #ifdef REGEX_GNU_EXTENSIONS
    337  1.39  christos 	if ((cflags & REG_GNU) == 0) {
    338  1.39  christos 		p->gnuext = false;
    339  1.39  christos 		p->allowbranch = (cflags & REG_EXTENDED) != 0;
    340  1.39  christos 	} else
    341  1.39  christos 		p->gnuext = p->allowbranch = true;
    342  1.39  christos #else
    343  1.39  christos 	p->gnuext = false;
    344  1.39  christos 	p->allowbranch = (cflags & REG_EXTENDED) != 0;
    345  1.39  christos #endif
    346  1.39  christos 	if (cflags & REG_EXTENDED) {
    347  1.39  christos 		p->bre = false;
    348  1.39  christos 		p->parse_expr = p_ere_exp;
    349  1.39  christos 		p->pre_parse = NULL;
    350  1.39  christos 		p->post_parse = NULL;
    351  1.39  christos 	} else {
    352  1.39  christos 		p->bre = true;
    353  1.39  christos 		p->parse_expr = p_simp_re;
    354  1.39  christos 		p->pre_parse = p_bre_pre_parse;
    355  1.39  christos 		p->post_parse = p_bre_post_parse;
    356  1.39  christos 	}
    357   1.1       jtc 	g->sets = NULL;
    358   1.1       jtc 	g->ncsets = 0;
    359   1.1       jtc 	g->cflags = cflags;
    360   1.1       jtc 	g->iflags = 0;
    361   1.1       jtc 	g->nbol = 0;
    362   1.1       jtc 	g->neol = 0;
    363   1.1       jtc 	g->must = NULL;
    364  1.39  christos 	g->moffset = -1;
    365  1.39  christos 	g->charjump = NULL;
    366  1.39  christos 	g->matchjump = NULL;
    367   1.1       jtc 	g->mlen = 0;
    368   1.1       jtc 	g->nsub = 0;
    369   1.1       jtc 	g->backrefs = 0;
    370   1.1       jtc 
    371   1.1       jtc 	/* do it */
    372   1.1       jtc 	EMIT(OEND, 0);
    373   1.1       jtc 	g->firststate = THERE();
    374  1.39  christos 	if (cflags & REG_NOSPEC)
    375   1.1       jtc 		p_str(p);
    376   1.1       jtc 	else
    377  1.39  christos 		p_re(p, OUT, OUT);
    378   1.1       jtc 	EMIT(OEND, 0);
    379   1.1       jtc 	g->laststate = THERE();
    380   1.1       jtc 
    381   1.1       jtc 	/* tidy up loose ends and fill things in */
    382   1.1       jtc 	stripsnug(p, g);
    383   1.1       jtc 	findmust(p, g);
    384  1.39  christos 	/* only use Boyer-Moore algorithm if the pattern is bigger
    385  1.39  christos 	 * than three characters
    386  1.39  christos 	 */
    387  1.39  christos 	if(g->mlen > 3) {
    388  1.39  christos 		computejumps(p, g);
    389  1.39  christos 		computematchjumps(p, g);
    390  1.39  christos 		if(g->matchjump == NULL && g->charjump != NULL) {
    391  1.39  christos 			free(g->charjump);
    392  1.39  christos 			g->charjump = NULL;
    393  1.39  christos 		}
    394  1.39  christos 	}
    395   1.1       jtc 	g->nplus = pluscount(p, g);
    396   1.1       jtc 	g->magic = MAGIC2;
    397   1.1       jtc 	preg->re_nsub = g->nsub;
    398   1.1       jtc 	preg->re_g = g;
    399   1.1       jtc 	preg->re_magic = MAGIC1;
    400   1.1       jtc #ifndef REDEBUG
    401   1.1       jtc 	/* not debugging, so can't rely on the assert() in regexec() */
    402   1.1       jtc 	if (g->iflags&BAD)
    403   1.1       jtc 		SETERROR(REG_ASSERT);
    404   1.1       jtc #endif
    405   1.1       jtc 
    406   1.1       jtc 	/* win or lose, we're done */
    407   1.1       jtc 	if (p->error != 0)	/* lose */
    408   1.1       jtc 		regfree(preg);
    409   1.1       jtc 	return(p->error);
    410   1.1       jtc }
    411   1.1       jtc 
    412   1.1       jtc /*
    413  1.39  christos  - regcomp - interface for parser and compilation
    414  1.39  christos  = extern int regcomp(regex_t *, const char *, int);
    415  1.39  christos  = #define	REG_BASIC	0000
    416  1.39  christos  = #define	REG_EXTENDED	0001
    417  1.39  christos  = #define	REG_ICASE	0002
    418  1.39  christos  = #define	REG_NOSUB	0004
    419  1.39  christos  = #define	REG_NEWLINE	0010
    420  1.39  christos  = #define	REG_NOSPEC	0020
    421  1.39  christos  = #define	REG_PEND	0040
    422  1.39  christos  = #define	REG_DUMP	0200
    423   1.1       jtc  */
    424  1.39  christos int				/* 0 success, otherwise REG_something */
    425  1.39  christos regcomp(regex_t * __restrict preg,
    426  1.39  christos 	const char * __restrict pattern,
    427  1.39  christos 	int cflags)
    428   1.1       jtc {
    429  1.30  christos 
    430  1.39  christos 	return (regcomp_internal(preg, pattern, cflags, 0));
    431   1.1       jtc }
    432   1.1       jtc 
    433   1.1       jtc /*
    434  1.39  christos  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
    435  1.39  christos  - return whether we should terminate or not
    436  1.39  christos  == static bool p_ere_exp(struct parse *p);
    437   1.1       jtc  */
    438  1.39  christos static bool
    439  1.39  christos p_ere_exp(struct parse *p, struct branchc *bc)
    440   1.1       jtc {
    441   1.9     perry 	char c;
    442  1.39  christos 	wint_t wc;
    443   1.9     perry 	sopno pos;
    444   1.9     perry 	int count;
    445   1.9     perry 	int count2;
    446  1.39  christos #ifdef REGEX_GNU_EXTENSIONS
    447  1.40  christos 	size_t i;
    448  1.39  christos 	int handled;
    449  1.39  christos #endif
    450   1.9     perry 	sopno subno;
    451   1.1       jtc 	int wascaret = 0;
    452   1.1       jtc 
    453  1.14     lukem 	_DIAGASSERT(p != NULL);
    454  1.14     lukem 
    455  1.39  christos 	(void)bc;
    456   1.1       jtc 	assert(MORE());		/* caller should have ensured this */
    457   1.1       jtc 	c = GETNEXT();
    458   1.1       jtc 
    459  1.39  christos #ifdef REGEX_GNU_EXTENSIONS
    460  1.39  christos 	handled = 0;
    461  1.39  christos #endif
    462   1.1       jtc 	pos = HERE();
    463   1.1       jtc 	switch (c) {
    464   1.1       jtc 	case '(':
    465  1.39  christos 		(void)REQUIRE(MORE(), REG_EPAREN);
    466   1.1       jtc 		p->g->nsub++;
    467  1.40  christos 		subno = (sopno)p->g->nsub;
    468   1.1       jtc 		if (subno < NPAREN)
    469   1.1       jtc 			p->pbegin[subno] = HERE();
    470   1.1       jtc 		EMIT(OLPAREN, subno);
    471   1.1       jtc 		if (!SEE(')'))
    472  1.39  christos 			p_re(p, ')', IGN);
    473   1.1       jtc 		if (subno < NPAREN) {
    474   1.1       jtc 			p->pend[subno] = HERE();
    475   1.1       jtc 			assert(p->pend[subno] != 0);
    476   1.1       jtc 		}
    477   1.1       jtc 		EMIT(ORPAREN, subno);
    478  1.39  christos 		(void)MUSTEAT(')', REG_EPAREN);
    479   1.1       jtc 		break;
    480   1.1       jtc #ifndef POSIX_MISTAKE
    481   1.1       jtc 	case ')':		/* happens only if no current unmatched ( */
    482   1.1       jtc 		/*
    483   1.1       jtc 		 * You may ask, why the ifndef?  Because I didn't notice
    484   1.1       jtc 		 * this until slightly too late for 1003.2, and none of the
    485   1.1       jtc 		 * other 1003.2 regular-expression reviewers noticed it at
    486   1.1       jtc 		 * all.  So an unmatched ) is legal POSIX, at least until
    487   1.1       jtc 		 * we can get it fixed.
    488   1.1       jtc 		 */
    489   1.1       jtc 		SETERROR(REG_EPAREN);
    490   1.1       jtc 		break;
    491   1.1       jtc #endif
    492   1.1       jtc 	case '^':
    493   1.1       jtc 		EMIT(OBOL, 0);
    494   1.1       jtc 		p->g->iflags |= USEBOL;
    495   1.1       jtc 		p->g->nbol++;
    496   1.1       jtc 		wascaret = 1;
    497   1.1       jtc 		break;
    498   1.1       jtc 	case '$':
    499   1.1       jtc 		EMIT(OEOL, 0);
    500   1.1       jtc 		p->g->iflags |= USEEOL;
    501   1.1       jtc 		p->g->neol++;
    502   1.1       jtc 		break;
    503   1.1       jtc 	case '|':
    504   1.1       jtc 		SETERROR(REG_EMPTY);
    505   1.1       jtc 		break;
    506   1.1       jtc 	case '*':
    507   1.1       jtc 	case '+':
    508   1.1       jtc 	case '?':
    509  1.39  christos 	case '{':
    510   1.1       jtc 		SETERROR(REG_BADRPT);
    511   1.1       jtc 		break;
    512   1.1       jtc 	case '.':
    513   1.1       jtc 		if (p->g->cflags&REG_NEWLINE)
    514   1.1       jtc 			nonnewline(p);
    515   1.1       jtc 		else
    516   1.1       jtc 			EMIT(OANY, 0);
    517   1.1       jtc 		break;
    518   1.1       jtc 	case '[':
    519   1.1       jtc 		p_bracket(p);
    520   1.1       jtc 		break;
    521   1.1       jtc 	case '\\':
    522  1.39  christos 		(void)REQUIRE(MORE(), REG_EESCAPE);
    523  1.39  christos 		wc = WGETNEXT();
    524  1.39  christos #ifdef REGEX_GNU_EXTENSIONS
    525  1.39  christos 		if (p->gnuext) {
    526  1.39  christos 			handled = 1;
    527  1.39  christos 			switch (wc) {
    528  1.39  christos 			case '`':
    529  1.39  christos 				EMIT(OBOS, 0);
    530  1.39  christos 				break;
    531  1.39  christos 			case '\'':
    532  1.39  christos 				EMIT(OEOS, 0);
    533  1.39  christos 				break;
    534  1.39  christos 			case 'B':
    535  1.39  christos 				EMIT(ONWBND, 0);
    536  1.39  christos 				break;
    537  1.39  christos 			case 'b':
    538  1.39  christos 				EMIT(OWBND, 0);
    539  1.39  christos 				break;
    540  1.39  christos 			case 'W':
    541  1.39  christos 			case 'w':
    542  1.39  christos 			case 'S':
    543  1.39  christos 			case 's':
    544  1.39  christos 				p_b_pseudoclass(p, wc);
    545  1.39  christos 				break;
    546  1.39  christos 			case '1':
    547  1.39  christos 			case '2':
    548  1.39  christos 			case '3':
    549  1.39  christos 			case '4':
    550  1.39  christos 			case '5':
    551  1.39  christos 			case '6':
    552  1.39  christos 			case '7':
    553  1.39  christos 			case '8':
    554  1.39  christos 			case '9':
    555  1.39  christos 				i = wc - '0';
    556  1.39  christos 				assert(i < NPAREN);
    557  1.39  christos 				if (p->pend[i] != 0) {
    558  1.39  christos 					assert(i <= p->g->nsub);
    559  1.39  christos 					EMIT(OBACK_, i);
    560  1.39  christos 					assert(p->pbegin[i] != 0);
    561  1.39  christos 					assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
    562  1.39  christos 					assert(OP(p->strip[p->pend[i]]) == ORPAREN);
    563  1.39  christos 					(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
    564  1.39  christos 					EMIT(O_BACK, i);
    565  1.39  christos 				} else
    566  1.39  christos 					SETERROR(REG_ESUBREG);
    567  1.39  christos 				p->g->backrefs = 1;
    568  1.39  christos 				break;
    569  1.39  christos 			default:
    570  1.39  christos 				handled = 0;
    571  1.39  christos 			}
    572  1.39  christos 			/* Don't proceed to the POSIX bits if we've already handled it */
    573  1.39  christos 			if (handled)
    574  1.39  christos 				break;
    575  1.39  christos 		}
    576  1.39  christos #endif
    577  1.39  christos 		switch (wc) {
    578  1.39  christos 		case '<':
    579  1.39  christos 			EMIT(OBOW, 0);
    580  1.39  christos 			break;
    581  1.39  christos 		case '>':
    582  1.39  christos 			EMIT(OEOW, 0);
    583  1.39  christos 			break;
    584  1.39  christos 		default:
    585  1.39  christos 			if (may_escape(p, wc))
    586  1.39  christos 				ordinary(p, wc);
    587  1.39  christos 			else
    588  1.39  christos 				SETERROR(REG_EESCAPE);
    589  1.39  christos 			break;
    590  1.39  christos 		}
    591   1.1       jtc 		break;
    592   1.1       jtc 	default:
    593  1.38  christos 		if (p->error != 0)
    594  1.39  christos 			return (false);
    595  1.39  christos 		p->next--;
    596  1.39  christos 		wc = WGETNEXT();
    597  1.39  christos 		ordinary(p, wc);
    598   1.1       jtc 		break;
    599   1.1       jtc 	}
    600   1.1       jtc 
    601   1.1       jtc 	if (!MORE())
    602  1.39  christos 		return (false);
    603   1.1       jtc 	c = PEEK();
    604   1.1       jtc 	/* we call { a repetition if followed by a digit */
    605  1.39  christos 	if (!( c == '*' || c == '+' || c == '?' || c == '{'))
    606  1.39  christos 		return (false);		/* no repetition, we're done */
    607  1.39  christos 	else if (c == '{')
    608  1.39  christos 		(void)REQUIRE(MORE2() && \
    609  1.39  christos 		    (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
    610   1.1       jtc 	NEXT();
    611   1.1       jtc 
    612  1.39  christos 	(void)REQUIRE(!wascaret, REG_BADRPT);
    613   1.1       jtc 	switch (c) {
    614   1.1       jtc 	case '*':	/* implemented as +? */
    615   1.4       jtc 		/* this case does not require the (y|) trick, noKLUDGE */
    616   1.1       jtc 		INSERT(OPLUS_, pos);
    617   1.1       jtc 		ASTERN(O_PLUS, pos);
    618   1.1       jtc 		INSERT(OQUEST_, pos);
    619   1.1       jtc 		ASTERN(O_QUEST, pos);
    620   1.1       jtc 		break;
    621   1.1       jtc 	case '+':
    622   1.1       jtc 		INSERT(OPLUS_, pos);
    623   1.1       jtc 		ASTERN(O_PLUS, pos);
    624   1.1       jtc 		break;
    625   1.1       jtc 	case '?':
    626   1.4       jtc 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
    627   1.4       jtc 		INSERT(OCH_, pos);		/* offset slightly wrong */
    628   1.4       jtc 		ASTERN(OOR1, pos);		/* this one's right */
    629   1.4       jtc 		AHEAD(pos);			/* fix the OCH_ */
    630   1.4       jtc 		EMIT(OOR2, 0);			/* offset very wrong... */
    631   1.4       jtc 		AHEAD(THERE());			/* ...so fix it */
    632   1.4       jtc 		ASTERN(O_CH, THERETHERE());
    633   1.1       jtc 		break;
    634   1.1       jtc 	case '{':
    635   1.1       jtc 		count = p_count(p);
    636   1.1       jtc 		if (EAT(',')) {
    637  1.39  christos 			if (isdigit((uch)PEEK())) {
    638   1.1       jtc 				count2 = p_count(p);
    639  1.39  christos 				(void)REQUIRE(count <= count2, REG_BADBR);
    640   1.1       jtc 			} else		/* single number with comma */
    641   1.1       jtc 				count2 = INFINITY;
    642   1.1       jtc 		} else		/* just a single number */
    643   1.1       jtc 			count2 = count;
    644  1.39  christos 		repeat(p, pos, count, count2);
    645   1.1       jtc 		if (!EAT('}')) {	/* error heuristics */
    646   1.1       jtc 			while (MORE() && PEEK() != '}')
    647   1.1       jtc 				NEXT();
    648  1.39  christos 			(void)REQUIRE(MORE(), REG_EBRACE);
    649   1.1       jtc 			SETERROR(REG_BADBR);
    650   1.1       jtc 		}
    651   1.1       jtc 		break;
    652   1.1       jtc 	}
    653   1.1       jtc 
    654   1.1       jtc 	if (!MORE())
    655  1.39  christos 		return (false);
    656   1.1       jtc 	c = PEEK();
    657   1.1       jtc 	if (!( c == '*' || c == '+' || c == '?' ||
    658  1.39  christos 				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
    659  1.39  christos 		return (false);
    660   1.1       jtc 	SETERROR(REG_BADRPT);
    661  1.39  christos 	return (false);
    662   1.1       jtc }
    663   1.1       jtc 
    664   1.1       jtc /*
    665   1.1       jtc  - p_str - string (no metacharacters) "parser"
    666   1.9     perry  == static void p_str(struct parse *p);
    667   1.1       jtc  */
    668   1.1       jtc static void
    669  1.39  christos p_str(struct parse *p)
    670   1.1       jtc {
    671  1.39  christos 	(void)REQUIRE(MORE(), REG_EMPTY);
    672  1.39  christos 	while (MORE())
    673  1.39  christos 		ordinary(p, WGETNEXT());
    674  1.39  christos }
    675  1.14     lukem 
    676  1.39  christos /*
    677  1.39  christos  * Eat consecutive branch delimiters for the kind of expression that we are
    678  1.39  christos  * parsing, return the number of delimiters that we ate.
    679  1.39  christos  */
    680  1.39  christos static int
    681  1.39  christos p_branch_eat_delim(struct parse *p, struct branchc *bc)
    682  1.39  christos {
    683  1.39  christos 	int nskip;
    684  1.14     lukem 
    685  1.39  christos 	(void)bc;
    686  1.39  christos 	nskip = 0;
    687  1.39  christos 	while (EATSPEC('|'))
    688  1.39  christos 		++nskip;
    689  1.39  christos 	return (nskip);
    690   1.1       jtc }
    691   1.1       jtc 
    692   1.1       jtc /*
    693  1.39  christos  * Insert necessary branch book-keeping operations. This emits a
    694  1.39  christos  * bogus 'next' offset, since we still have more to parse
    695   1.1       jtc  */
    696   1.1       jtc static void
    697  1.39  christos p_branch_ins_offset(struct parse *p, struct branchc *bc)
    698   1.9     perry {
    699   1.1       jtc 
    700  1.39  christos 	if (bc->nbranch == 0) {
    701  1.39  christos 		INSERT(OCH_, bc->start);	/* offset is wrong */
    702  1.39  christos 		bc->fwd = bc->start;
    703  1.39  christos 		bc->back = bc->start;
    704  1.39  christos 	}
    705  1.39  christos 
    706  1.39  christos 	ASTERN(OOR1, bc->back);
    707  1.39  christos 	bc->back = THERE();
    708  1.39  christos 	AHEAD(bc->fwd);			/* fix previous offset */
    709  1.39  christos 	bc->fwd = HERE();
    710  1.39  christos 	EMIT(OOR2, 0);			/* offset is very wrong */
    711  1.39  christos 	++bc->nbranch;
    712  1.39  christos }
    713  1.39  christos 
    714  1.39  christos /*
    715  1.39  christos  * Fix the offset of the tail branch, if we actually had any branches.
    716  1.39  christos  * This is to correct the bogus placeholder offset that we use.
    717  1.39  christos  */
    718  1.39  christos static void
    719  1.39  christos p_branch_fix_tail(struct parse *p, struct branchc *bc)
    720  1.39  christos {
    721  1.14     lukem 
    722  1.39  christos 	/* Fix bogus offset at the tail if we actually have branches */
    723  1.39  christos 	if (bc->nbranch > 0) {
    724  1.39  christos 		AHEAD(bc->fwd);
    725  1.39  christos 		ASTERN(O_CH, bc->back);
    726  1.30  christos 	}
    727  1.39  christos }
    728  1.39  christos 
    729  1.39  christos /*
    730  1.39  christos  * Signal to the parser that an empty branch has been encountered; this will,
    731  1.39  christos  * in the future, be used to allow for more permissive behavior with empty
    732  1.39  christos  * branches. The return value should indicate whether parsing may continue
    733  1.39  christos  * or not.
    734  1.39  christos  */
    735  1.39  christos static bool
    736  1.39  christos p_branch_empty(struct parse *p, struct branchc *bc)
    737  1.39  christos {
    738  1.39  christos 
    739  1.39  christos 	(void)bc;
    740  1.39  christos 	SETERROR(REG_EMPTY);
    741  1.39  christos 	return (false);
    742  1.39  christos }
    743  1.39  christos 
    744  1.39  christos /*
    745  1.39  christos  * Take care of any branching requirements. This includes inserting the
    746  1.39  christos  * appropriate branching instructions as well as eating all of the branch
    747  1.39  christos  * delimiters until we either run out of pattern or need to parse more pattern.
    748  1.39  christos  */
    749  1.39  christos static bool
    750  1.39  christos p_branch_do(struct parse *p, struct branchc *bc)
    751  1.39  christos {
    752  1.39  christos 	int ate = 0;
    753  1.39  christos 
    754  1.39  christos 	ate = p_branch_eat_delim(p, bc);
    755  1.39  christos 	if (ate == 0)
    756  1.39  christos 		return (false);
    757  1.39  christos 	else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
    758  1.39  christos 		/*
    759  1.39  christos 		 * Halt parsing only if we have an empty branch and p_branch_empty
    760  1.39  christos 		 * indicates that we must not continue. In the future, this will not
    761  1.39  christos 		 * necessarily be an error.
    762  1.39  christos 		 */
    763  1.39  christos 		return (false);
    764  1.39  christos 	p_branch_ins_offset(p, bc);
    765  1.39  christos 
    766  1.39  christos 	return (true);
    767  1.39  christos }
    768  1.30  christos 
    769  1.39  christos static void
    770  1.39  christos p_bre_pre_parse(struct parse *p, struct branchc *bc)
    771  1.39  christos {
    772  1.14     lukem 
    773  1.40  christos 	(void)bc;
    774  1.39  christos 	/*
    775  1.39  christos 	 * Does not move cleanly into expression parser because of
    776  1.39  christos 	 * ordinary interpration of * at the beginning position of
    777  1.39  christos 	 * an expression.
    778  1.39  christos 	 */
    779   1.1       jtc 	if (EAT('^')) {
    780   1.1       jtc 		EMIT(OBOL, 0);
    781   1.1       jtc 		p->g->iflags |= USEBOL;
    782   1.1       jtc 		p->g->nbol++;
    783   1.1       jtc 	}
    784  1.39  christos }
    785  1.39  christos 
    786  1.39  christos static void
    787  1.39  christos p_bre_post_parse(struct parse *p, struct branchc *bc)
    788  1.39  christos {
    789  1.39  christos 
    790  1.39  christos 	/* Expression is terminating due to EOL token */
    791  1.39  christos 	if (bc->terminate) {
    792   1.1       jtc 		DROP(1);
    793   1.1       jtc 		EMIT(OEOL, 0);
    794   1.1       jtc 		p->g->iflags |= USEEOL;
    795   1.1       jtc 		p->g->neol++;
    796   1.1       jtc 	}
    797  1.39  christos }
    798   1.1       jtc 
    799  1.39  christos /*
    800  1.39  christos  - p_re - Top level parser, concatenation and BRE anchoring
    801  1.39  christos  == static void p_re(struct parse *p, int end1, int end2);
    802  1.39  christos  * Giving end1 as OUT essentially eliminates the end1/end2 check.
    803  1.39  christos  *
    804  1.39  christos  * This implementation is a bit of a kludge, in that a trailing $ is first
    805  1.39  christos  * taken as an ordinary character and then revised to be an anchor.
    806  1.39  christos  * The amount of lookahead needed to avoid this kludge is excessive.
    807  1.39  christos  */
    808  1.39  christos static void
    809  1.39  christos p_re(struct parse *p,
    810  1.39  christos 	int end1,	/* first terminating character */
    811  1.39  christos 	int end2)	/* second terminating character; ignored for EREs */
    812  1.39  christos {
    813  1.39  christos 	struct branchc bc;
    814  1.39  christos 
    815  1.39  christos 	bc.nbranch = 0;
    816  1.39  christos 	if (end1 == OUT && end2 == OUT)
    817  1.39  christos 		bc.outer = true;
    818  1.39  christos 	else
    819  1.39  christos 		bc.outer = false;
    820  1.39  christos #define	SEEEND()	(!p->bre ? SEE(end1) : SEETWO(end1, end2))
    821  1.39  christos 	for (;;) {
    822  1.39  christos 		bc.start = HERE();
    823  1.39  christos 		bc.nchain = 0;
    824  1.39  christos 		bc.terminate = false;
    825  1.39  christos 		if (p->pre_parse != NULL)
    826  1.39  christos 			p->pre_parse(p, &bc);
    827  1.39  christos 		while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
    828  1.39  christos 			bc.terminate = p->parse_expr(p, &bc);
    829  1.39  christos 			++bc.nchain;
    830  1.39  christos 		}
    831  1.39  christos 		if (p->post_parse != NULL)
    832  1.39  christos 			p->post_parse(p, &bc);
    833  1.39  christos 		(void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY);
    834  1.39  christos #ifdef REGEX_GNU_EXTENSIONS
    835  1.39  christos 		if (p->gnuext && HERE() == bc.start && !p_branch_empty(p, &bc))
    836  1.39  christos 			break;
    837  1.39  christos #endif
    838  1.39  christos 		if (!p->allowbranch)
    839  1.39  christos 			break;
    840  1.39  christos 		/*
    841  1.39  christos 		 * p_branch_do's return value indicates whether we should
    842  1.39  christos 		 * continue parsing or not. This is both for correctness and
    843  1.39  christos 		 * a slight optimization, because it will check if we've
    844  1.39  christos 		 * encountered an empty branch or the end of the string
    845  1.39  christos 		 * immediately following a branch delimiter.
    846  1.39  christos 		 */
    847  1.39  christos 		if (!p_branch_do(p, &bc))
    848  1.39  christos 			break;
    849  1.39  christos 	}
    850  1.39  christos #undef SEE_END
    851  1.39  christos 	if (p->allowbranch)
    852  1.39  christos 		p_branch_fix_tail(p, &bc);
    853  1.39  christos 	assert(!MORE() || SEE(end1));
    854   1.1       jtc }
    855   1.1       jtc 
    856   1.1       jtc /*
    857   1.1       jtc  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
    858  1.39  christos  == static bool p_simp_re(struct parse *p, struct branchc *bc);
    859   1.1       jtc  */
    860  1.39  christos static bool			/* was the simple RE an unbackslashed $? */
    861  1.39  christos p_simp_re(struct parse *p, struct branchc *bc)
    862   1.1       jtc {
    863   1.9     perry 	int c;
    864  1.39  christos 	int cc;			/* convenient/control character */
    865   1.9     perry 	int count;
    866   1.9     perry 	int count2;
    867  1.39  christos 	sopno pos;
    868  1.39  christos 	bool handled;
    869  1.40  christos 	size_t i;
    870  1.39  christos 	wint_t wc;
    871   1.9     perry 	sopno subno;
    872   1.1       jtc #	define	BACKSL	(1<<CHAR_BIT)
    873   1.1       jtc 
    874  1.39  christos 	pos = HERE();		/* repetition op, if any, covers from here */
    875  1.39  christos 	handled = false;
    876   1.1       jtc 
    877   1.1       jtc 	assert(MORE());		/* caller should have ensured this */
    878   1.1       jtc 	c = GETNEXT();
    879   1.1       jtc 	if (c == '\\') {
    880  1.39  christos 		(void)REQUIRE(MORE(), REG_EESCAPE);
    881  1.39  christos 		cc = GETNEXT();
    882  1.39  christos 		c = BACKSL | cc;
    883  1.39  christos #ifdef REGEX_GNU_EXTENSIONS
    884  1.39  christos 		if (p->gnuext) {
    885  1.39  christos 			handled = true;
    886  1.39  christos 			switch (c) {
    887  1.39  christos 			case BACKSL|'`':
    888  1.39  christos 				EMIT(OBOS, 0);
    889  1.39  christos 				break;
    890  1.39  christos 			case BACKSL|'\'':
    891  1.39  christos 				EMIT(OEOS, 0);
    892  1.39  christos 				break;
    893  1.39  christos 			case BACKSL|'B':
    894  1.39  christos 				EMIT(ONWBND, 0);
    895  1.39  christos 				break;
    896  1.39  christos 			case BACKSL|'b':
    897  1.39  christos 				EMIT(OWBND, 0);
    898  1.39  christos 				break;
    899  1.39  christos 			case BACKSL|'W':
    900  1.39  christos 			case BACKSL|'w':
    901  1.39  christos 			case BACKSL|'S':
    902  1.39  christos 			case BACKSL|'s':
    903  1.39  christos 				p_b_pseudoclass(p, cc);
    904  1.39  christos 				break;
    905  1.39  christos 			default:
    906  1.39  christos 				handled = false;
    907  1.39  christos 			}
    908  1.39  christos 		}
    909  1.39  christos #endif
    910   1.1       jtc 	}
    911  1.39  christos 	if (!handled) {
    912  1.39  christos 		switch (c) {
    913  1.39  christos 		case '.':
    914  1.39  christos 			if (p->g->cflags&REG_NEWLINE)
    915  1.39  christos 				nonnewline(p);
    916  1.39  christos 			else
    917  1.39  christos 				EMIT(OANY, 0);
    918  1.39  christos 			break;
    919  1.39  christos 		case '[':
    920  1.39  christos 			p_bracket(p);
    921  1.39  christos 			break;
    922  1.39  christos 		case BACKSL|'<':
    923  1.39  christos 			EMIT(OBOW, 0);
    924  1.39  christos 			break;
    925  1.39  christos 		case BACKSL|'>':
    926  1.39  christos 			EMIT(OEOW, 0);
    927  1.39  christos 			break;
    928  1.39  christos 		case BACKSL|'{':
    929  1.39  christos 			SETERROR(REG_BADRPT);
    930  1.39  christos 			break;
    931  1.39  christos 		case BACKSL|'(':
    932  1.39  christos 			p->g->nsub++;
    933  1.40  christos 			subno = (sopno)p->g->nsub;
    934  1.39  christos 			if (subno < NPAREN)
    935  1.39  christos 				p->pbegin[subno] = HERE();
    936  1.39  christos 			EMIT(OLPAREN, subno);
    937  1.39  christos 			/* the MORE here is an error heuristic */
    938  1.39  christos 			if (MORE() && !SEETWO('\\', ')'))
    939  1.39  christos 				p_re(p, '\\', ')');
    940  1.39  christos 			if (subno < NPAREN) {
    941  1.39  christos 				p->pend[subno] = HERE();
    942  1.39  christos 				assert(p->pend[subno] != 0);
    943  1.39  christos 			}
    944  1.39  christos 			EMIT(ORPAREN, subno);
    945  1.39  christos 			(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
    946  1.39  christos 			break;
    947  1.39  christos 		case BACKSL|')':	/* should not get here -- must be user */
    948  1.39  christos 			SETERROR(REG_EPAREN);
    949  1.39  christos 			break;
    950  1.39  christos 		case BACKSL|'1':
    951  1.39  christos 		case BACKSL|'2':
    952  1.39  christos 		case BACKSL|'3':
    953  1.39  christos 		case BACKSL|'4':
    954  1.39  christos 		case BACKSL|'5':
    955  1.39  christos 		case BACKSL|'6':
    956  1.39  christos 		case BACKSL|'7':
    957  1.39  christos 		case BACKSL|'8':
    958  1.39  christos 		case BACKSL|'9':
    959  1.39  christos 			i = (c&~BACKSL) - '0';
    960  1.39  christos 			assert(i < NPAREN);
    961  1.39  christos 			if (p->pend[i] != 0) {
    962  1.39  christos 				assert(i <= p->g->nsub);
    963  1.39  christos 				EMIT(OBACK_, i);
    964  1.39  christos 				assert(p->pbegin[i] != 0);
    965  1.39  christos 				assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
    966  1.39  christos 				assert(OP(p->strip[p->pend[i]]) == ORPAREN);
    967  1.39  christos 				(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
    968  1.39  christos 				EMIT(O_BACK, i);
    969  1.39  christos 			} else
    970  1.39  christos 				SETERROR(REG_ESUBREG);
    971  1.39  christos 			p->g->backrefs = 1;
    972  1.39  christos 			break;
    973  1.39  christos 		case '*':
    974  1.39  christos 			/*
    975  1.39  christos 			 * Ordinary if used as the first character beyond BOL anchor of
    976  1.39  christos 			 * a (sub-)expression, counts as a bad repetition operator if it
    977  1.39  christos 			 * appears otherwise.
    978  1.39  christos 			 */
    979  1.39  christos 			(void)REQUIRE(bc->nchain == 0, REG_BADRPT);
    980  1.39  christos 			/* FALLTHROUGH */
    981  1.39  christos 		default:
    982  1.39  christos 			if (p->error != 0)
    983  1.39  christos 				return (false);	/* Definitely not $... */
    984  1.39  christos 			p->next--;
    985  1.39  christos 			wc = WGETNEXT();
    986  1.39  christos 			if ((c & BACKSL) == 0 || may_escape(p, wc))
    987  1.39  christos 				ordinary(p, wc);
    988  1.39  christos 			else
    989  1.39  christos 				SETERROR(REG_EESCAPE);
    990  1.39  christos 			break;
    991   1.1       jtc 		}
    992   1.1       jtc 	}
    993   1.1       jtc 
    994   1.1       jtc 	if (EAT('*')) {		/* implemented as +? */
    995   1.4       jtc 		/* this case does not require the (y|) trick, noKLUDGE */
    996   1.1       jtc 		INSERT(OPLUS_, pos);
    997   1.1       jtc 		ASTERN(O_PLUS, pos);
    998   1.1       jtc 		INSERT(OQUEST_, pos);
    999   1.1       jtc 		ASTERN(O_QUEST, pos);
   1000  1.39  christos #ifdef REGEX_GNU_EXTENSIONS
   1001  1.39  christos 	} else if (p->gnuext && EATTWO('\\', '?')) {
   1002  1.39  christos 		INSERT(OQUEST_, pos);
   1003  1.39  christos 		ASTERN(O_QUEST, pos);
   1004  1.39  christos 	} else if (p->gnuext && EATTWO('\\', '+')) {
   1005  1.39  christos 		INSERT(OPLUS_, pos);
   1006  1.39  christos 		ASTERN(O_PLUS, pos);
   1007  1.39  christos #endif
   1008   1.1       jtc 	} else if (EATTWO('\\', '{')) {
   1009   1.1       jtc 		count = p_count(p);
   1010   1.1       jtc 		if (EAT(',')) {
   1011  1.39  christos 			if (MORE() && isdigit((uch)PEEK())) {
   1012   1.1       jtc 				count2 = p_count(p);
   1013  1.39  christos 				(void)REQUIRE(count <= count2, REG_BADBR);
   1014   1.1       jtc 			} else		/* single number with comma */
   1015   1.1       jtc 				count2 = INFINITY;
   1016   1.1       jtc 		} else		/* just a single number */
   1017   1.1       jtc 			count2 = count;
   1018  1.39  christos 		repeat(p, pos, count, count2);
   1019   1.1       jtc 		if (!EATTWO('\\', '}')) {	/* error heuristics */
   1020   1.1       jtc 			while (MORE() && !SEETWO('\\', '}'))
   1021   1.1       jtc 				NEXT();
   1022  1.39  christos 			(void)REQUIRE(MORE(), REG_EBRACE);
   1023   1.1       jtc 			SETERROR(REG_BADBR);
   1024   1.1       jtc 		}
   1025  1.39  christos 	} else if (c == '$')     /* $ (but not \$) ends it */
   1026  1.39  christos 		return (true);
   1027   1.1       jtc 
   1028  1.39  christos 	return (false);
   1029   1.1       jtc }
   1030   1.1       jtc 
   1031   1.1       jtc /*
   1032   1.1       jtc  - p_count - parse a repetition count
   1033   1.9     perry  == static int p_count(struct parse *p);
   1034   1.1       jtc  */
   1035   1.1       jtc static int			/* the value */
   1036  1.39  christos p_count(struct parse *p)
   1037   1.1       jtc {
   1038   1.9     perry 	int count = 0;
   1039   1.9     perry 	int ndigits = 0;
   1040   1.1       jtc 
   1041  1.39  christos 	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
   1042   1.1       jtc 		count = count*10 + (GETNEXT() - '0');
   1043   1.1       jtc 		ndigits++;
   1044   1.1       jtc 	}
   1045   1.1       jtc 
   1046  1.39  christos 	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
   1047   1.1       jtc 	return(count);
   1048   1.1       jtc }
   1049   1.1       jtc 
   1050   1.1       jtc /*
   1051   1.1       jtc  - p_bracket - parse a bracketed character list
   1052   1.9     perry  == static void p_bracket(struct parse *p);
   1053   1.1       jtc  */
   1054   1.1       jtc static void
   1055  1.39  christos p_bracket(struct parse *p)
   1056   1.1       jtc {
   1057  1.14     lukem 	cset *cs;
   1058  1.39  christos 	wint_t ch;
   1059  1.14     lukem 
   1060   1.1       jtc 	/* Dept of Truly Sickening Special-Case Kludges */
   1061  1.39  christos 	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
   1062   1.1       jtc 		EMIT(OBOW, 0);
   1063   1.1       jtc 		NEXTn(6);
   1064   1.1       jtc 		return;
   1065   1.1       jtc 	}
   1066  1.39  christos 	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
   1067   1.1       jtc 		EMIT(OEOW, 0);
   1068   1.1       jtc 		NEXTn(6);
   1069   1.1       jtc 		return;
   1070   1.1       jtc 	}
   1071   1.1       jtc 
   1072  1.39  christos 	if ((cs = allocset(p)) == NULL)
   1073  1.39  christos 		return;
   1074  1.39  christos 
   1075  1.39  christos 	if (p->g->cflags&REG_ICASE)
   1076  1.39  christos 		cs->icase = 1;
   1077   1.1       jtc 	if (EAT('^'))
   1078  1.39  christos 		cs->invert = 1;
   1079   1.1       jtc 	if (EAT(']'))
   1080  1.39  christos 		CHadd(p, cs, ']');
   1081   1.1       jtc 	else if (EAT('-'))
   1082  1.39  christos 		CHadd(p, cs, '-');
   1083   1.1       jtc 	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
   1084   1.1       jtc 		p_b_term(p, cs);
   1085   1.1       jtc 	if (EAT('-'))
   1086  1.39  christos 		CHadd(p, cs, '-');
   1087  1.39  christos 	(void)MUSTEAT(']', REG_EBRACK);
   1088   1.1       jtc 
   1089   1.1       jtc 	if (p->error != 0)	/* don't mess things up further */
   1090   1.1       jtc 		return;
   1091   1.1       jtc 
   1092  1.39  christos 	if (cs->invert && p->g->cflags&REG_NEWLINE)
   1093  1.39  christos 		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
   1094   1.1       jtc 
   1095  1.39  christos 	if ((ch = singleton(cs)) != OUT) {	/* optimize singleton sets */
   1096  1.39  christos 		ordinary(p, ch);
   1097  1.39  christos 		freeset(p, cs);
   1098  1.39  christos 	} else
   1099  1.40  christos 		EMIT(OANYOF, (size_t)(cs - p->g->sets));
   1100  1.39  christos }
   1101   1.1       jtc 
   1102  1.39  christos static int
   1103  1.39  christos p_range_cmp(wchar_t c1, wchar_t c2)
   1104  1.39  christos {
   1105  1.39  christos #ifdef REGEX_LIBC_COLLATE
   1106  1.39  christos 	return __wcollate_range_cmp(c1, c2);
   1107  1.42  christos #elif defined(NLS)
   1108  1.39  christos 	/* Copied from libc/collate __wcollate_range_cmp */
   1109  1.39  christos 	wchar_t s1[2], s2[2];
   1110   1.1       jtc 
   1111  1.39  christos 	s1[0] = c1;
   1112  1.39  christos 	s1[1] = L'\0';
   1113  1.39  christos 	s2[0] = c2;
   1114  1.39  christos 	s2[1] = L'\0';
   1115  1.42  christos 	return wcscoll(s1, s2);
   1116  1.42  christos #else
   1117  1.42  christos 	char s1[2], s2[2];
   1118  1.42  christos 
   1119  1.42  christos 	s1[0] = (char)c1;
   1120  1.42  christos 	s1[1] = '\0';
   1121  1.42  christos 	s2[0] = (char)c2;
   1122  1.42  christos 	s2[1] = '\0';
   1123  1.42  christos 	return strcoll(s1, s2);
   1124  1.39  christos #endif
   1125   1.1       jtc }
   1126   1.1       jtc 
   1127   1.1       jtc /*
   1128   1.1       jtc  - p_b_term - parse one term of a bracketed character list
   1129   1.9     perry  == static void p_b_term(struct parse *p, cset *cs);
   1130   1.1       jtc  */
   1131   1.1       jtc static void
   1132  1.39  christos p_b_term(struct parse *p, cset *cs)
   1133   1.1       jtc {
   1134   1.9     perry 	char c;
   1135  1.39  christos 	wint_t start, finish;
   1136  1.39  christos 	wint_t i;
   1137  1.39  christos #ifdef REGEX_LIBC_COLLATE
   1138  1.39  christos 	struct xlocale_collate *table =
   1139  1.39  christos 		(struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
   1140  1.39  christos #endif
   1141   1.1       jtc 
   1142  1.14     lukem 	_DIAGASSERT(p != NULL);
   1143  1.14     lukem 	_DIAGASSERT(cs != NULL);
   1144  1.14     lukem 
   1145   1.1       jtc 	/* classify what we've got */
   1146   1.1       jtc 	switch ((MORE()) ? PEEK() : '\0') {
   1147   1.1       jtc 	case '[':
   1148   1.1       jtc 		c = (MORE2()) ? PEEK2() : '\0';
   1149   1.1       jtc 		break;
   1150   1.1       jtc 	case '-':
   1151   1.1       jtc 		SETERROR(REG_ERANGE);
   1152   1.1       jtc 		return;			/* NOTE RETURN */
   1153   1.1       jtc 	default:
   1154   1.1       jtc 		c = '\0';
   1155   1.1       jtc 		break;
   1156   1.1       jtc 	}
   1157   1.1       jtc 
   1158   1.1       jtc 	switch (c) {
   1159   1.1       jtc 	case ':':		/* character class */
   1160   1.1       jtc 		NEXT2();
   1161  1.39  christos 		(void)REQUIRE(MORE(), REG_EBRACK);
   1162   1.1       jtc 		c = PEEK();
   1163  1.39  christos 		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
   1164   1.1       jtc 		p_b_cclass(p, cs);
   1165  1.39  christos 		(void)REQUIRE(MORE(), REG_EBRACK);
   1166  1.39  christos 		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
   1167   1.1       jtc 		break;
   1168   1.1       jtc 	case '=':		/* equivalence class */
   1169   1.1       jtc 		NEXT2();
   1170  1.39  christos 		(void)REQUIRE(MORE(), REG_EBRACK);
   1171   1.1       jtc 		c = PEEK();
   1172  1.39  christos 		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
   1173   1.1       jtc 		p_b_eclass(p, cs);
   1174  1.39  christos 		(void)REQUIRE(MORE(), REG_EBRACK);
   1175  1.39  christos 		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
   1176   1.1       jtc 		break;
   1177   1.1       jtc 	default:		/* symbol, ordinary character, or range */
   1178   1.1       jtc 		start = p_b_symbol(p);
   1179   1.1       jtc 		if (SEE('-') && MORE2() && PEEK2() != ']') {
   1180   1.1       jtc 			/* range */
   1181   1.1       jtc 			NEXT();
   1182   1.1       jtc 			if (EAT('-'))
   1183   1.1       jtc 				finish = '-';
   1184   1.1       jtc 			else
   1185   1.1       jtc 				finish = p_b_symbol(p);
   1186   1.1       jtc 		} else
   1187   1.1       jtc 			finish = start;
   1188  1.39  christos 		if (start == finish)
   1189  1.39  christos 			CHadd(p, cs, start);
   1190  1.39  christos 		else {
   1191  1.39  christos #ifdef REGEX_LIBC_COLLATE
   1192  1.39  christos 			if (table->__collate_load_error || MB_CUR_MAX > 1) {
   1193  1.39  christos #else
   1194  1.39  christos 			if (MB_CUR_MAX > 1) {
   1195  1.39  christos #endif
   1196  1.39  christos 				(void)REQUIRE(start <= finish, REG_ERANGE);
   1197  1.39  christos 				CHaddrange(p, cs, start, finish);
   1198  1.39  christos 			} else {
   1199  1.39  christos 				(void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE);
   1200  1.39  christos 				for (i = 0; i <= UCHAR_MAX; i++) {
   1201  1.39  christos 					if (p_range_cmp(start, i) <= 0 &&
   1202  1.39  christos 					    p_range_cmp(i, finish) <= 0 )
   1203  1.39  christos 						CHadd(p, cs, i);
   1204  1.39  christos 				}
   1205  1.39  christos 			}
   1206  1.39  christos 		}
   1207  1.39  christos 		break;
   1208  1.39  christos 	}
   1209  1.39  christos }
   1210  1.39  christos 
   1211  1.39  christos #ifdef REGEX_GNU_EXTENSIONS
   1212  1.39  christos /*
   1213  1.39  christos  - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S)
   1214  1.39  christos  == static int p_b_pseudoclass(struct parse *p, char c)
   1215  1.39  christos  */
   1216  1.39  christos static int
   1217  1.39  christos p_b_pseudoclass(struct parse *p, char c) {
   1218  1.39  christos 	cset *cs;
   1219  1.39  christos 
   1220  1.39  christos 	if ((cs = allocset(p)) == NULL)
   1221  1.39  christos 		return(0);
   1222  1.39  christos 
   1223  1.39  christos 	if (p->g->cflags&REG_ICASE)
   1224  1.39  christos 		cs->icase = 1;
   1225  1.39  christos 
   1226  1.39  christos 	switch (c) {
   1227  1.39  christos 	case 'W':
   1228  1.39  christos 		cs->invert = 1;
   1229  1.39  christos 		/* FALLTHROUGH */
   1230  1.39  christos 	case 'w':
   1231  1.39  christos 		p_b_cclass_named(p, cs, "alnum");
   1232   1.1       jtc 		break;
   1233  1.39  christos 	case 'S':
   1234  1.39  christos 		cs->invert = 1;
   1235  1.39  christos 		/* FALLTHROUGH */
   1236  1.39  christos 	case 's':
   1237  1.39  christos 		p_b_cclass_named(p, cs, "space");
   1238  1.39  christos 		break;
   1239  1.39  christos 	default:
   1240  1.39  christos 		return(0);
   1241   1.1       jtc 	}
   1242  1.39  christos 
   1243  1.40  christos 	EMIT(OANYOF, (size_t)(cs - p->g->sets));
   1244  1.39  christos 	return(1);
   1245   1.1       jtc }
   1246  1.39  christos #endif
   1247   1.1       jtc 
   1248   1.1       jtc /*
   1249   1.1       jtc  - p_b_cclass - parse a character-class name and deal with it
   1250   1.9     perry  == static void p_b_cclass(struct parse *p, cset *cs);
   1251   1.1       jtc  */
   1252   1.1       jtc static void
   1253  1.39  christos p_b_cclass(struct parse *p, cset *cs)
   1254   1.1       jtc {
   1255  1.39  christos 	const char *sp = p->next;
   1256   1.9     perry 	size_t len;
   1257  1.39  christos 	char clname[16];
   1258   1.1       jtc 
   1259  1.39  christos 	while (MORE() && isalpha((uch)PEEK()))
   1260   1.1       jtc 		NEXT();
   1261   1.1       jtc 	len = p->next - sp;
   1262  1.39  christos 	if (len >= sizeof(clname) - 1) {
   1263   1.1       jtc 		SETERROR(REG_ECTYPE);
   1264   1.1       jtc 		return;
   1265   1.1       jtc 	}
   1266  1.39  christos 	memcpy(clname, sp, len);
   1267  1.39  christos 	clname[len] = '\0';
   1268  1.39  christos 
   1269  1.39  christos 	p_b_cclass_named(p, cs, clname);
   1270  1.39  christos }
   1271  1.42  christos 
   1272  1.39  christos /*
   1273  1.39  christos  - p_b_cclass_named - deal with a named character class
   1274  1.39  christos  == static void p_b_cclass_named(struct parse *p, cset *cs, const char []);
   1275  1.39  christos  */
   1276  1.39  christos static void
   1277  1.39  christos p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) {
   1278  1.39  christos 	wctype_t wct;
   1279   1.1       jtc 
   1280  1.39  christos 	if ((wct = wctype(clname)) == 0) {
   1281  1.39  christos 		SETERROR(REG_ECTYPE);
   1282  1.39  christos 		return;
   1283  1.39  christos 	}
   1284  1.39  christos 	CHaddtype(p, cs, wct);
   1285   1.1       jtc }
   1286   1.1       jtc 
   1287   1.1       jtc /*
   1288   1.1       jtc  - p_b_eclass - parse an equivalence-class name and deal with it
   1289   1.9     perry  == static void p_b_eclass(struct parse *p, cset *cs);
   1290   1.1       jtc  *
   1291   1.1       jtc  * This implementation is incomplete. xxx
   1292   1.1       jtc  */
   1293   1.1       jtc static void
   1294  1.39  christos p_b_eclass(struct parse *p, cset *cs)
   1295   1.1       jtc {
   1296  1.39  christos 	wint_t c;
   1297   1.1       jtc 
   1298  1.14     lukem 	_DIAGASSERT(p != NULL);
   1299  1.14     lukem 	_DIAGASSERT(cs != NULL);
   1300  1.14     lukem 
   1301   1.1       jtc 	c = p_b_coll_elem(p, '=');
   1302  1.39  christos 	CHadd(p, cs, c);
   1303   1.1       jtc }
   1304   1.1       jtc 
   1305   1.1       jtc /*
   1306   1.1       jtc  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
   1307  1.39  christos  == static wint_t p_b_symbol(struct parse *p);
   1308   1.1       jtc  */
   1309  1.39  christos static wint_t			/* value of symbol */
   1310  1.39  christos p_b_symbol(struct parse *p)
   1311   1.1       jtc {
   1312  1.39  christos 	wint_t value;
   1313   1.1       jtc 
   1314  1.14     lukem 	_DIAGASSERT(p != NULL);
   1315  1.14     lukem 
   1316  1.39  christos 	(void)REQUIRE(MORE(), REG_EBRACK);
   1317   1.1       jtc 	if (!EATTWO('[', '.'))
   1318  1.39  christos 		return(WGETNEXT());
   1319   1.1       jtc 
   1320   1.1       jtc 	/* collating symbol */
   1321   1.1       jtc 	value = p_b_coll_elem(p, '.');
   1322  1.39  christos 	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
   1323   1.1       jtc 	return(value);
   1324   1.1       jtc }
   1325   1.1       jtc 
   1326   1.1       jtc /*
   1327   1.1       jtc  - p_b_coll_elem - parse a collating-element name and look it up
   1328  1.39  christos  == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
   1329   1.1       jtc  */
   1330  1.39  christos static wint_t			/* value of collating element */
   1331  1.39  christos p_b_coll_elem(struct parse *p,
   1332  1.39  christos 	wint_t endc)		/* name ended by endc,']' */
   1333  1.39  christos {
   1334  1.39  christos 	const char *sp = p->next;
   1335  1.39  christos 	struct cname *cp;
   1336  1.42  christos 	size_t len;
   1337   1.1       jtc 
   1338  1.14     lukem 	_DIAGASSERT(p != NULL);
   1339  1.14     lukem 
   1340   1.1       jtc 	while (MORE() && !SEETWO(endc, ']'))
   1341   1.1       jtc 		NEXT();
   1342   1.1       jtc 	if (!MORE()) {
   1343   1.1       jtc 		SETERROR(REG_EBRACK);
   1344   1.1       jtc 		return(0);
   1345   1.1       jtc 	}
   1346   1.1       jtc 	len = p->next - sp;
   1347   1.1       jtc 	for (cp = cnames; cp->name != NULL; cp++)
   1348  1.37  christos 		if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
   1349   1.1       jtc 			return(cp->code);	/* known name */
   1350  1.42  christos #ifdef NLS
   1351  1.42  christos 	mbstate_t mbs;
   1352  1.42  christos 	wchar_t wc;
   1353  1.42  christos 	size_t clen;
   1354  1.42  christos 
   1355  1.39  christos 	memset(&mbs, 0, sizeof(mbs));
   1356  1.39  christos 	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
   1357  1.39  christos 		return (wc);			/* single character */
   1358  1.39  christos 	else if (clen == (size_t)-1 || clen == (size_t)-2)
   1359  1.39  christos 		SETERROR(REG_ILLSEQ);
   1360  1.39  christos 	else
   1361  1.39  christos 		SETERROR(REG_ECOLLATE);		/* neither */
   1362   1.1       jtc 	return(0);
   1363  1.42  christos #else
   1364  1.42  christos 	if (len == 1)
   1365  1.42  christos 		return *sp;    /* single character */
   1366  1.42  christos 	SETERROR(REG_ECOLLATE);                 /* neither */
   1367  1.42  christos 	return 0;
   1368  1.42  christos #endif
   1369   1.1       jtc }
   1370   1.1       jtc 
   1371   1.1       jtc /*
   1372  1.39  christos  - may_escape - determine whether 'ch' is escape-able in the current context
   1373  1.39  christos  == static int may_escape(struct parse *p, const wint_t ch)
   1374  1.39  christos  */
   1375  1.39  christos static bool
   1376  1.39  christos may_escape(struct parse *p, const wint_t ch)
   1377  1.39  christos {
   1378  1.39  christos 
   1379  1.39  christos 	if ((p->pflags & PFLAG_LEGACY_ESC) != 0)
   1380  1.39  christos 		return (true);
   1381  1.39  christos 	if (isalpha(ch) || ch == '\'' || ch == '`')
   1382  1.39  christos 		return (false);
   1383  1.39  christos 	return (true);
   1384  1.39  christos #ifdef NOTYET
   1385  1.39  christos 	/*
   1386  1.39  christos 	 * Build a whitelist of characters that may be escaped to produce an
   1387  1.39  christos 	 * ordinary in the current context. This assumes that these have not
   1388  1.39  christos 	 * been otherwise interpreted as a special character. Escaping an
   1389  1.39  christos 	 * ordinary character yields undefined results according to
   1390  1.39  christos 	 * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
   1391  1.39  christos 	 * advantage of this and use escaped ordinary characters to provide
   1392  1.39  christos 	 * special meaning, e.g. \b, \B, \w, \W, \s, \S.
   1393  1.39  christos 	 */
   1394  1.39  christos 	switch(ch) {
   1395  1.39  christos 	case '|':
   1396  1.39  christos 	case '+':
   1397  1.39  christos 	case '?':
   1398  1.39  christos 		/* The above characters may not be escaped in BREs */
   1399  1.39  christos 		if (!(p->g->cflags&REG_EXTENDED))
   1400  1.39  christos 			return (false);
   1401  1.39  christos 		/* Fallthrough */
   1402  1.39  christos 	case '(':
   1403  1.39  christos 	case ')':
   1404  1.39  christos 	case '{':
   1405  1.39  christos 	case '}':
   1406  1.39  christos 	case '.':
   1407  1.39  christos 	case '[':
   1408  1.39  christos 	case ']':
   1409  1.39  christos 	case '\\':
   1410  1.39  christos 	case '*':
   1411  1.39  christos 	case '^':
   1412  1.39  christos 	case '$':
   1413  1.39  christos 		return (true);
   1414  1.39  christos 	default:
   1415  1.39  christos 		return (false);
   1416  1.39  christos 	}
   1417  1.39  christos #endif
   1418  1.39  christos }
   1419  1.39  christos 
   1420  1.39  christos /*
   1421   1.1       jtc  - othercase - return the case counterpart of an alphabetic
   1422  1.39  christos  == static wint_t othercase(wint_t ch);
   1423   1.1       jtc  */
   1424  1.39  christos static wint_t			/* if no counterpart, return ch */
   1425  1.39  christos othercase(wint_t ch)
   1426  1.39  christos {
   1427  1.39  christos 	assert(iswalpha(ch));
   1428  1.39  christos 	if (iswupper(ch))
   1429  1.39  christos 		return(towlower(ch));
   1430  1.39  christos 	else if (iswlower(ch))
   1431  1.39  christos 		return(towupper(ch));
   1432   1.1       jtc 	else			/* peculiar, but could happen */
   1433   1.1       jtc 		return(ch);
   1434   1.1       jtc }
   1435   1.1       jtc 
   1436   1.1       jtc /*
   1437   1.1       jtc  - bothcases - emit a dualcase version of a two-case character
   1438  1.39  christos  == static void bothcases(struct parse *p, wint_t ch);
   1439   1.1       jtc  *
   1440   1.1       jtc  * Boy, is this implementation ever a kludge...
   1441   1.1       jtc  */
   1442   1.1       jtc static void
   1443  1.39  christos bothcases(struct parse *p, wint_t ch)
   1444   1.1       jtc {
   1445  1.39  christos 	const char *oldnext = p->next;
   1446  1.39  christos 	const char *oldend = p->end;
   1447  1.39  christos 	char bracket[3 + MB_LEN_MAX];
   1448  1.39  christos 	size_t n;
   1449   1.1       jtc 
   1450  1.14     lukem 	_DIAGASSERT(p != NULL);
   1451  1.14     lukem 
   1452   1.1       jtc 	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
   1453   1.1       jtc 	p->next = bracket;
   1454  1.42  christos #ifdef NLS
   1455  1.42  christos 	mbstate_t mbs;
   1456  1.39  christos 	memset(&mbs, 0, sizeof(mbs));
   1457  1.39  christos 	n = wcrtomb(bracket, ch, &mbs);
   1458  1.39  christos 	assert(n != (size_t)-1);
   1459  1.42  christos #else
   1460  1.42  christos 	n = 0;
   1461  1.42  christos 	bracket[n++] = ch;
   1462  1.42  christos #endif
   1463  1.39  christos 	bracket[n] = ']';
   1464  1.39  christos 	bracket[n + 1] = '\0';
   1465  1.39  christos 	p->end = bracket+n+1;
   1466   1.1       jtc 	p_bracket(p);
   1467  1.39  christos 	assert(p->next == p->end);
   1468   1.1       jtc 	p->next = oldnext;
   1469   1.1       jtc 	p->end = oldend;
   1470   1.1       jtc }
   1471   1.1       jtc 
   1472   1.1       jtc /*
   1473   1.1       jtc  - ordinary - emit an ordinary character
   1474  1.39  christos  == static void ordinary(struct parse *p, wint_t ch);
   1475   1.1       jtc  */
   1476   1.1       jtc static void
   1477  1.39  christos ordinary(struct parse *p, wint_t ch)
   1478   1.1       jtc {
   1479  1.39  christos 	cset *cs;
   1480   1.1       jtc 
   1481  1.14     lukem 	_DIAGASSERT(p != NULL);
   1482  1.14     lukem 
   1483  1.39  christos 	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
   1484  1.39  christos 		bothcases(p, ch);
   1485  1.40  christos 	else if ((wint_t)(ch & OPDMASK) == ch)
   1486  1.40  christos 		EMIT(OCHAR, (size_t)ch);
   1487   1.1       jtc 	else {
   1488  1.39  christos 		/*
   1489  1.39  christos 		 * Kludge: character is too big to fit into an OCHAR operand.
   1490  1.39  christos 		 * Emit a singleton set.
   1491  1.39  christos 		 */
   1492  1.39  christos 		if ((cs = allocset(p)) == NULL)
   1493  1.39  christos 			return;
   1494  1.39  christos 		CHadd(p, cs, ch);
   1495  1.40  christos 		EMIT(OANYOF, (size_t)(cs - p->g->sets));
   1496   1.1       jtc 	}
   1497   1.1       jtc }
   1498   1.1       jtc 
   1499   1.1       jtc /*
   1500   1.1       jtc  - nonnewline - emit REG_NEWLINE version of OANY
   1501   1.9     perry  == static void nonnewline(struct parse *p);
   1502   1.1       jtc  *
   1503   1.1       jtc  * Boy, is this implementation ever a kludge...
   1504   1.1       jtc  */
   1505   1.1       jtc static void
   1506  1.39  christos nonnewline(struct parse *p)
   1507   1.1       jtc {
   1508  1.39  christos 	const char *oldnext = p->next;
   1509  1.39  christos 	const char *oldend = p->end;
   1510   1.1       jtc 	char bracket[4];
   1511   1.1       jtc 
   1512  1.14     lukem 	_DIAGASSERT(p != NULL);
   1513  1.14     lukem 
   1514   1.1       jtc 	p->next = bracket;
   1515   1.1       jtc 	p->end = bracket+3;
   1516   1.1       jtc 	bracket[0] = '^';
   1517   1.1       jtc 	bracket[1] = '\n';
   1518   1.1       jtc 	bracket[2] = ']';
   1519   1.1       jtc 	bracket[3] = '\0';
   1520   1.1       jtc 	p_bracket(p);
   1521   1.1       jtc 	assert(p->next == bracket+3);
   1522   1.1       jtc 	p->next = oldnext;
   1523   1.1       jtc 	p->end = oldend;
   1524   1.1       jtc }
   1525   1.1       jtc 
   1526   1.1       jtc /*
   1527   1.1       jtc  - repeat - generate code for a bounded repetition, recursively if needed
   1528  1.39  christos  == static void repeat(struct parse *p, sopno start, int from, int to);
   1529   1.1       jtc  */
   1530   1.1       jtc static void
   1531  1.39  christos repeat(struct parse *p,
   1532  1.39  christos 	sopno start,		/* operand from here to end of strip */
   1533  1.39  christos 	int from,		/* repeated from this number */
   1534  1.39  christos 	int to)			/* to this number of times (maybe INFINITY) */
   1535   1.1       jtc {
   1536  1.39  christos 	sopno finish = HERE();
   1537   1.1       jtc #	define	N	2
   1538   1.1       jtc #	define	INF	3
   1539   1.1       jtc #	define	REP(f, t)	((f)*8 + (t))
   1540   1.1       jtc #	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
   1541   1.9     perry 	sopno copy;
   1542   1.1       jtc 
   1543  1.14     lukem 	_DIAGASSERT(p != NULL);
   1544  1.14     lukem 
   1545  1.39  christos 	if (p->error != 0)	/* head off possible runaway recursion */
   1546  1.30  christos 		return;
   1547  1.30  christos 
   1548   1.1       jtc 	assert(from <= to);
   1549   1.1       jtc 
   1550   1.1       jtc 	switch (REP(MAP(from), MAP(to))) {
   1551   1.1       jtc 	case REP(0, 0):			/* must be user doing this */
   1552   1.1       jtc 		DROP(finish-start);	/* drop the operand */
   1553   1.1       jtc 		break;
   1554   1.1       jtc 	case REP(0, 1):			/* as x{1,1}? */
   1555   1.1       jtc 	case REP(0, N):			/* as x{1,n}? */
   1556   1.1       jtc 	case REP(0, INF):		/* as x{1,}? */
   1557   1.4       jtc 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
   1558   1.4       jtc 		INSERT(OCH_, start);		/* offset is wrong... */
   1559  1.39  christos 		repeat(p, start+1, 1, to);
   1560   1.4       jtc 		ASTERN(OOR1, start);
   1561   1.1       jtc 		AHEAD(start);			/* ... fix it */
   1562   1.4       jtc 		EMIT(OOR2, 0);
   1563   1.4       jtc 		AHEAD(THERE());
   1564   1.4       jtc 		ASTERN(O_CH, THERETHERE());
   1565   1.1       jtc 		break;
   1566   1.1       jtc 	case REP(1, 1):			/* trivial case */
   1567   1.1       jtc 		/* done */
   1568   1.1       jtc 		break;
   1569   1.1       jtc 	case REP(1, N):			/* as x?x{1,n-1} */
   1570   1.4       jtc 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
   1571   1.4       jtc 		INSERT(OCH_, start);
   1572   1.4       jtc 		ASTERN(OOR1, start);
   1573   1.4       jtc 		AHEAD(start);
   1574   1.4       jtc 		EMIT(OOR2, 0);			/* offset very wrong... */
   1575   1.4       jtc 		AHEAD(THERE());			/* ...so fix it */
   1576   1.4       jtc 		ASTERN(O_CH, THERETHERE());
   1577   1.1       jtc 		copy = dupl(p, start+1, finish+1);
   1578   1.4       jtc 		assert(copy == finish+4);
   1579  1.39  christos 		repeat(p, copy, 1, to-1);
   1580   1.1       jtc 		break;
   1581   1.1       jtc 	case REP(1, INF):		/* as x+ */
   1582   1.1       jtc 		INSERT(OPLUS_, start);
   1583   1.1       jtc 		ASTERN(O_PLUS, start);
   1584   1.1       jtc 		break;
   1585   1.1       jtc 	case REP(N, N):			/* as xx{m-1,n-1} */
   1586   1.1       jtc 		copy = dupl(p, start, finish);
   1587  1.39  christos 		repeat(p, copy, from-1, to-1);
   1588   1.1       jtc 		break;
   1589   1.1       jtc 	case REP(N, INF):		/* as xx{n-1,INF} */
   1590   1.1       jtc 		copy = dupl(p, start, finish);
   1591  1.39  christos 		repeat(p, copy, from-1, to);
   1592   1.1       jtc 		break;
   1593   1.1       jtc 	default:			/* "can't happen" */
   1594   1.1       jtc 		SETERROR(REG_ASSERT);	/* just in case */
   1595   1.1       jtc 		break;
   1596   1.1       jtc 	}
   1597   1.1       jtc }
   1598   1.1       jtc 
   1599   1.1       jtc /*
   1600  1.39  christos  - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
   1601  1.39  christos  - character from the parse struct, signals a REG_ILLSEQ error if the
   1602  1.39  christos  - character can't be converted. Returns the number of bytes consumed.
   1603  1.39  christos  */
   1604  1.39  christos static wint_t
   1605  1.39  christos wgetnext(struct parse *p)
   1606  1.39  christos {
   1607  1.42  christos #ifdef NLS
   1608  1.39  christos 	mbstate_t mbs;
   1609  1.39  christos 	wchar_t wc;
   1610  1.39  christos 	size_t n;
   1611  1.39  christos 
   1612  1.39  christos 	memset(&mbs, 0, sizeof(mbs));
   1613  1.40  christos 	n = mbrtowc(&wc, p->next, (size_t)(p->end - p->next), &mbs);
   1614  1.39  christos 	if (n == (size_t)-1 || n == (size_t)-2) {
   1615  1.39  christos 		SETERROR(REG_ILLSEQ);
   1616  1.39  christos 		return (0);
   1617  1.39  christos 	}
   1618  1.39  christos 	if (n == 0)
   1619  1.39  christos 		n = 1;
   1620  1.39  christos 	p->next += n;
   1621  1.42  christos 	return wc;
   1622  1.42  christos #else
   1623  1.42  christos 	return *p->next++;
   1624  1.42  christos #endif
   1625  1.39  christos }
   1626  1.39  christos 
   1627  1.39  christos /*
   1628   1.1       jtc  - seterr - set an error condition
   1629   1.9     perry  == static int seterr(struct parse *p, int e);
   1630   1.1       jtc  */
   1631   1.1       jtc static int			/* useless but makes type checking happy */
   1632  1.39  christos seterr(struct parse *p, int e)
   1633   1.1       jtc {
   1634  1.14     lukem 
   1635  1.14     lukem 	_DIAGASSERT(p != NULL);
   1636  1.14     lukem 
   1637   1.1       jtc 	if (p->error == 0)	/* keep earliest error condition */
   1638   1.1       jtc 		p->error = e;
   1639   1.1       jtc 	p->next = nuls;		/* try to bring things to a halt */
   1640   1.1       jtc 	p->end = nuls;
   1641   1.1       jtc 	return(0);		/* make the return value well-defined */
   1642   1.1       jtc }
   1643   1.1       jtc 
   1644   1.1       jtc /*
   1645   1.1       jtc  - allocset - allocate a set of characters for []
   1646   1.9     perry  == static cset *allocset(struct parse *p);
   1647   1.1       jtc  */
   1648   1.1       jtc static cset *
   1649  1.39  christos allocset(struct parse *p)
   1650   1.1       jtc {
   1651  1.39  christos 	cset *cs, *ncs;
   1652   1.1       jtc 
   1653  1.14     lukem 	_DIAGASSERT(p != NULL);
   1654  1.14     lukem 
   1655  1.39  christos 	ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs));
   1656  1.39  christos 	if (ncs == NULL) {
   1657  1.39  christos 		SETERROR(REG_ESPACE);
   1658  1.39  christos 		return (NULL);
   1659   1.1       jtc 	}
   1660  1.39  christos 	p->g->sets = ncs;
   1661  1.39  christos 	cs = &p->g->sets[p->g->ncsets++];
   1662  1.39  christos 	memset(cs, 0, sizeof(*cs));
   1663   1.1       jtc 
   1664   1.1       jtc 	return(cs);
   1665   1.1       jtc }
   1666   1.1       jtc 
   1667   1.1       jtc /*
   1668   1.1       jtc  - freeset - free a now-unused set
   1669   1.9     perry  == static void freeset(struct parse *p, cset *cs);
   1670   1.1       jtc  */
   1671   1.1       jtc static void
   1672  1.39  christos freeset(struct parse *p, cset *cs)
   1673   1.1       jtc {
   1674  1.14     lukem 	cset *top;
   1675  1.14     lukem 
   1676  1.14     lukem 	_DIAGASSERT(p != NULL);
   1677  1.14     lukem 	_DIAGASSERT(cs != NULL);
   1678  1.14     lukem 
   1679  1.14     lukem 	top = &p->g->sets[p->g->ncsets];
   1680   1.1       jtc 
   1681  1.39  christos 	free(cs->wides);
   1682  1.39  christos 	free(cs->ranges);
   1683  1.39  christos 	free(cs->types);
   1684  1.39  christos 	memset(cs, 0, sizeof(*cs));
   1685   1.1       jtc 	if (cs == top-1)	/* recover only the easy case */
   1686   1.1       jtc 		p->g->ncsets--;
   1687   1.1       jtc }
   1688   1.1       jtc 
   1689   1.1       jtc /*
   1690  1.39  christos  - singleton - Determine whether a set contains only one character,
   1691  1.39  christos  - returning it if so, otherwise returning OUT.
   1692   1.1       jtc  */
   1693  1.39  christos static wint_t
   1694  1.39  christos singleton(cset *cs)
   1695   1.1       jtc {
   1696  1.39  christos 	wint_t i, s, n;
   1697   1.1       jtc 
   1698  1.39  christos 	for (i = n = 0; i < NC; i++)
   1699  1.39  christos 		if (CHIN(cs, i)) {
   1700   1.1       jtc 			n++;
   1701  1.39  christos 			s = i;
   1702  1.39  christos 		}
   1703  1.39  christos 	if (n == 1)
   1704  1.39  christos 		return (s);
   1705  1.39  christos 	if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
   1706  1.39  christos 	    cs->icase == 0)
   1707  1.39  christos 		return (cs->wides[0]);
   1708  1.39  christos 	/* Don't bother handling the other cases. */
   1709  1.39  christos 	return (OUT);
   1710   1.1       jtc }
   1711   1.1       jtc 
   1712   1.1       jtc /*
   1713  1.39  christos  - CHadd - add character to character set.
   1714   1.1       jtc  */
   1715   1.1       jtc static void
   1716  1.39  christos CHadd(struct parse *p, cset *cs, wint_t ch)
   1717   1.1       jtc {
   1718  1.39  christos 	wint_t nch, *newwides;
   1719  1.14     lukem 
   1720  1.14     lukem 	_DIAGASSERT(p != NULL);
   1721  1.14     lukem 	_DIAGASSERT(cs != NULL);
   1722  1.14     lukem 
   1723  1.39  christos 	assert(ch >= 0);
   1724  1.39  christos 	if (ch < NC)
   1725  1.40  christos 		cs->bmp[(unsigned)ch >> 3] |= 1 << (ch & 7);
   1726  1.39  christos 	else {
   1727  1.39  christos 		newwides = reallocarray(cs->wides, cs->nwides + 1,
   1728  1.39  christos 		    sizeof(*cs->wides));
   1729  1.39  christos 		if (newwides == NULL) {
   1730  1.39  christos 			SETERROR(REG_ESPACE);
   1731  1.39  christos 			return;
   1732  1.39  christos 		}
   1733  1.39  christos 		cs->wides = newwides;
   1734  1.39  christos 		cs->wides[cs->nwides++] = ch;
   1735  1.39  christos 	}
   1736  1.39  christos 	if (cs->icase) {
   1737  1.39  christos 		if ((nch = towlower(ch)) < NC)
   1738  1.40  christos 			cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
   1739  1.39  christos 		if ((nch = towupper(ch)) < NC)
   1740  1.40  christos 			cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
   1741   1.1       jtc 	}
   1742   1.1       jtc }
   1743   1.1       jtc 
   1744   1.1       jtc /*
   1745  1.39  christos  - CHaddrange - add all characters in the range [min,max] to a character set.
   1746   1.1       jtc  */
   1747   1.1       jtc static void
   1748  1.39  christos CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
   1749   1.1       jtc {
   1750  1.39  christos 	crange *newranges;
   1751  1.14     lukem 
   1752  1.39  christos 	_DIAGASSERT(p != NULL);
   1753  1.14     lukem 	_DIAGASSERT(cs != NULL);
   1754  1.14     lukem 
   1755  1.39  christos 	for (; min < NC && min <= max; min++)
   1756  1.39  christos 		CHadd(p, cs, min);
   1757  1.39  christos 	if (min >= max)
   1758  1.39  christos 		return;
   1759  1.39  christos 	newranges = reallocarray(cs->ranges, cs->nranges + 1,
   1760  1.39  christos 	    sizeof(*cs->ranges));
   1761  1.39  christos 	if (newranges == NULL) {
   1762  1.39  christos 		SETERROR(REG_ESPACE);
   1763   1.1       jtc 		return;
   1764   1.1       jtc 	}
   1765  1.39  christos 	cs->ranges = newranges;
   1766  1.39  christos 	cs->ranges[cs->nranges].min = min;
   1767  1.39  christos 	cs->ranges[cs->nranges].max = max;
   1768  1.39  christos 	cs->nranges++;
   1769   1.1       jtc }
   1770   1.1       jtc 
   1771   1.1       jtc /*
   1772  1.39  christos  - CHaddtype - add all characters of a certain type to a character set.
   1773   1.1       jtc  */
   1774   1.1       jtc static void
   1775  1.39  christos CHaddtype(struct parse *p, cset *cs, wctype_t wct)
   1776   1.1       jtc {
   1777  1.39  christos 	wint_t i;
   1778  1.39  christos 	wctype_t *newtypes;
   1779  1.14     lukem 
   1780  1.14     lukem 	_DIAGASSERT(p != NULL);
   1781  1.14     lukem 	_DIAGASSERT(cs != NULL);
   1782  1.14     lukem 
   1783  1.39  christos 	for (i = 0; i < NC; i++)
   1784  1.39  christos 		if (iswctype(i, wct))
   1785  1.39  christos 			CHadd(p, cs, i);
   1786  1.39  christos 	newtypes = reallocarray(cs->types, cs->ntypes + 1,
   1787  1.39  christos 	    sizeof(*cs->types));
   1788  1.39  christos 	if (newtypes == NULL) {
   1789  1.39  christos 		SETERROR(REG_ESPACE);
   1790   1.1       jtc 		return;
   1791  1.39  christos 	}
   1792  1.39  christos 	cs->types = newtypes;
   1793  1.39  christos 	cs->types[cs->ntypes++] = wct;
   1794   1.1       jtc }
   1795   1.1       jtc 
   1796   1.1       jtc /*
   1797   1.1       jtc  - dupl - emit a duplicate of a bunch of sops
   1798   1.9     perry  == static sopno dupl(struct parse *p, sopno start, sopno finish);
   1799   1.1       jtc  */
   1800   1.1       jtc static sopno			/* start of duplicate */
   1801  1.39  christos dupl(struct parse *p,
   1802  1.39  christos 	sopno start,		/* from here */
   1803  1.39  christos 	sopno finish)		/* to this less one */
   1804   1.1       jtc {
   1805  1.39  christos 	sopno ret = HERE();
   1806   1.9     perry 	sopno len = finish - start;
   1807   1.1       jtc 
   1808  1.14     lukem 	_DIAGASSERT(p != NULL);
   1809  1.14     lukem 
   1810   1.1       jtc 	assert(finish >= start);
   1811   1.1       jtc 	if (len == 0)
   1812   1.1       jtc 		return(ret);
   1813  1.39  christos 	if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
   1814  1.39  christos 		return(ret);
   1815  1.40  christos 	(void) memcpy(p->strip + p->slen,
   1816  1.40  christos 	    p->strip + start, len * sizeof(*p->strip));
   1817   1.1       jtc 	p->slen += len;
   1818   1.1       jtc 	return(ret);
   1819   1.1       jtc }
   1820   1.1       jtc 
   1821   1.1       jtc /*
   1822   1.1       jtc  - doemit - emit a strip operator
   1823   1.9     perry  == static void doemit(struct parse *p, sop op, size_t opnd);
   1824   1.1       jtc  *
   1825   1.1       jtc  * It might seem better to implement this as a macro with a function as
   1826   1.1       jtc  * hard-case backup, but it's just too big and messy unless there are
   1827   1.1       jtc  * some changes to the data structures.  Maybe later.
   1828   1.1       jtc  */
   1829   1.1       jtc static void
   1830  1.39  christos doemit(struct parse *p, sop op, size_t opnd)
   1831   1.1       jtc {
   1832   1.1       jtc 	/* avoid making error situations worse */
   1833   1.1       jtc 	if (p->error != 0)
   1834   1.1       jtc 		return;
   1835   1.1       jtc 
   1836  1.39  christos 	_DIAGASSERT(p != NULL);
   1837  1.39  christos 
   1838   1.1       jtc 	/* deal with oversize operands ("can't happen", more or less) */
   1839   1.1       jtc 	assert(opnd < 1<<OPSHIFT);
   1840   1.1       jtc 
   1841   1.1       jtc 	/* deal with undersized strip */
   1842   1.1       jtc 	if (p->slen >= p->ssize)
   1843  1.30  christos 		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
   1844  1.30  christos 			return;
   1845   1.1       jtc 
   1846   1.1       jtc 	/* finally, it's all reduced to the easy case */
   1847  1.40  christos 	p->strip[p->slen++] = (sopno)SOP(op, opnd);
   1848   1.1       jtc }
   1849   1.1       jtc 
   1850   1.1       jtc /*
   1851   1.1       jtc  - doinsert - insert a sop into the strip
   1852   1.9     perry  == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
   1853   1.1       jtc  */
   1854   1.1       jtc static void
   1855  1.39  christos doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
   1856   1.1       jtc {
   1857   1.9     perry 	sopno sn;
   1858   1.9     perry 	sop s;
   1859   1.9     perry 	int i;
   1860   1.1       jtc 
   1861  1.14     lukem 	_DIAGASSERT(p != NULL);
   1862  1.14     lukem 
   1863   1.1       jtc 	/* avoid making error situations worse */
   1864   1.1       jtc 	if (p->error != 0)
   1865   1.1       jtc 		return;
   1866   1.1       jtc 
   1867   1.1       jtc 	sn = HERE();
   1868   1.1       jtc 	EMIT(op, opnd);		/* do checks, ensure space */
   1869   1.1       jtc 	assert(HERE() == sn+1);
   1870   1.1       jtc 	s = p->strip[sn];
   1871   1.1       jtc 
   1872   1.1       jtc 	/* adjust paren pointers */
   1873   1.1       jtc 	assert(pos > 0);
   1874   1.1       jtc 	for (i = 1; i < NPAREN; i++) {
   1875   1.1       jtc 		if (p->pbegin[i] >= pos) {
   1876   1.1       jtc 			p->pbegin[i]++;
   1877   1.1       jtc 		}
   1878   1.1       jtc 		if (p->pend[i] >= pos) {
   1879   1.1       jtc 			p->pend[i]++;
   1880   1.1       jtc 		}
   1881   1.1       jtc 	}
   1882   1.1       jtc 
   1883  1.40  christos 	memmove(&p->strip[pos+1], &p->strip[pos],
   1884  1.40  christos 	    (HERE()-pos-1)*sizeof(*p->strip));
   1885   1.1       jtc 	p->strip[pos] = s;
   1886   1.1       jtc }
   1887   1.1       jtc 
   1888   1.1       jtc /*
   1889   1.1       jtc  - dofwd - complete a forward reference
   1890   1.9     perry  == static void dofwd(struct parse *p, sopno pos, sop value);
   1891   1.1       jtc  */
   1892   1.1       jtc static void
   1893  1.39  christos dofwd(struct parse *p, sopno pos, sop value)
   1894   1.1       jtc {
   1895  1.14     lukem 
   1896  1.14     lukem 	_DIAGASSERT(p != NULL);
   1897  1.14     lukem 
   1898   1.1       jtc 	/* avoid making error situations worse */
   1899   1.1       jtc 	if (p->error != 0)
   1900   1.1       jtc 		return;
   1901   1.1       jtc 
   1902   1.1       jtc 	assert(value < 1<<OPSHIFT);
   1903  1.39  christos 	p->strip[pos] = OP(p->strip[pos]) | value;
   1904   1.1       jtc }
   1905   1.1       jtc 
   1906   1.1       jtc /*
   1907   1.1       jtc  - enlarge - enlarge the strip
   1908  1.39  christos  == static int enlarge(struct parse *p, sopno size);
   1909   1.1       jtc  */
   1910  1.30  christos static int
   1911  1.35     joerg enlarge(struct parse *p, sopno size)
   1912   1.1       jtc {
   1913  1.39  christos 	sop *sp;
   1914  1.39  christos 
   1915  1.14     lukem 	_DIAGASSERT(p != NULL);
   1916  1.14     lukem 
   1917   1.1       jtc 	if (p->ssize >= size)
   1918  1.30  christos 		return 1;
   1919   1.1       jtc 
   1920  1.40  christos 	sp = reallocarray(p->strip, size, sizeof(*p->strip));
   1921  1.39  christos 	if (sp == NULL) {
   1922   1.1       jtc 		SETERROR(REG_ESPACE);
   1923  1.30  christos 		return 0;
   1924   1.1       jtc 	}
   1925  1.39  christos 	p->strip = sp;
   1926  1.35     joerg 	p->ssize = size;
   1927  1.30  christos 	return 1;
   1928   1.1       jtc }
   1929   1.1       jtc 
   1930   1.1       jtc /*
   1931   1.1       jtc  - stripsnug - compact the strip
   1932   1.9     perry  == static void stripsnug(struct parse *p, struct re_guts *g);
   1933   1.1       jtc  */
   1934   1.1       jtc static void
   1935  1.39  christos stripsnug(struct parse *p, struct re_guts *g)
   1936   1.1       jtc {
   1937  1.14     lukem 
   1938  1.14     lukem 	_DIAGASSERT(p != NULL);
   1939  1.14     lukem 	_DIAGASSERT(g != NULL);
   1940  1.14     lukem 
   1941   1.1       jtc 	g->nstates = p->slen;
   1942  1.40  christos 	g->strip = reallocarray(p->strip, p->slen, sizeof(*p->strip));
   1943  1.39  christos 	if (g->strip == NULL) {
   1944  1.39  christos 		SETERROR(REG_ESPACE);
   1945  1.39  christos 		g->strip = p->strip;
   1946  1.39  christos 	}
   1947   1.1       jtc }
   1948   1.1       jtc 
   1949   1.1       jtc /*
   1950   1.1       jtc  - findmust - fill in must and mlen with longest mandatory literal string
   1951   1.9     perry  == static void findmust(struct parse *p, struct re_guts *g);
   1952   1.1       jtc  *
   1953   1.1       jtc  * This algorithm could do fancy things like analyzing the operands of |
   1954   1.1       jtc  * for common subsequences.  Someday.  This code is simple and finds most
   1955   1.1       jtc  * of the interesting cases.
   1956   1.1       jtc  *
   1957   1.1       jtc  * Note that must and mlen got initialized during setup.
   1958   1.1       jtc  */
   1959   1.1       jtc static void
   1960  1.39  christos findmust(struct parse *p, struct re_guts *g)
   1961   1.1       jtc {
   1962   1.9     perry 	sop *scan;
   1963   1.7  christos 	sop *start = NULL;
   1964   1.9     perry 	sop *newstart = NULL;
   1965   1.9     perry 	sopno newlen;
   1966   1.9     perry 	sop s;
   1967   1.9     perry 	char *cp;
   1968  1.39  christos 	int offset;
   1969  1.39  christos 	mbstate_t mbs;
   1970   1.1       jtc 
   1971  1.14     lukem 	_DIAGASSERT(p != NULL);
   1972  1.14     lukem 	_DIAGASSERT(g != NULL);
   1973  1.14     lukem 
   1974   1.1       jtc 	/* avoid making error situations worse */
   1975   1.1       jtc 	if (p->error != 0)
   1976   1.1       jtc 		return;
   1977   1.1       jtc 
   1978  1.39  christos #ifdef notyet
   1979  1.39  christos 	/*
   1980  1.39  christos 	 * It's not generally safe to do a ``char'' substring search on
   1981  1.39  christos 	 * multibyte character strings, but it's safe for at least
   1982  1.39  christos 	 * UTF-8 (see RFC 3629).
   1983  1.39  christos 	 */
   1984  1.39  christos 	if (MB_CUR_MAX > 1 &&
   1985  1.39  christos 	    strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
   1986  1.39  christos 		return;
   1987  1.39  christos #endif
   1988  1.39  christos 
   1989   1.1       jtc 	/* find the longest OCHAR sequence in strip */
   1990   1.1       jtc 	newlen = 0;
   1991  1.39  christos 	offset = 0;
   1992  1.39  christos 	g->moffset = 0;
   1993   1.1       jtc 	scan = g->strip + 1;
   1994   1.1       jtc 	do {
   1995   1.1       jtc 		s = *scan++;
   1996   1.1       jtc 		switch (OP(s)) {
   1997   1.1       jtc 		case OCHAR:		/* sequence member */
   1998  1.39  christos 			if (newlen == 0) {		/* new sequence */
   1999  1.39  christos 				memset(&mbs, 0, sizeof(mbs));
   2000   1.1       jtc 				newstart = scan - 1;
   2001  1.39  christos 			}
   2002  1.42  christos #ifdef NLS
   2003  1.42  christos 			char buf[MB_LEN_MAX];
   2004  1.42  christos 			size_t clen = wcrtomb(buf, (int)OPND(s), &mbs);
   2005  1.39  christos 			if (clen == (size_t)-1)
   2006  1.39  christos 				goto toohard;
   2007  1.40  christos 			newlen += (sopno)clen;
   2008  1.42  christos #else
   2009  1.42  christos 			newlen++;
   2010  1.42  christos #endif
   2011   1.1       jtc 			break;
   2012   1.1       jtc 		case OPLUS_:		/* things that don't break one */
   2013   1.1       jtc 		case OLPAREN:
   2014   1.1       jtc 		case ORPAREN:
   2015   1.1       jtc 			break;
   2016   1.1       jtc 		case OQUEST_:		/* things that must be skipped */
   2017   1.1       jtc 		case OCH_:
   2018  1.39  christos 			offset = altoffset(scan, offset);
   2019   1.1       jtc 			scan--;
   2020   1.1       jtc 			do {
   2021   1.1       jtc 				scan += OPND(s);
   2022   1.1       jtc 				s = *scan;
   2023   1.1       jtc 				/* assert() interferes w debug printouts */
   2024  1.40  christos 				if (OP(s) != O_QUEST &&
   2025  1.40  christos 				    OP(s) != O_CH && OP(s) != OOR2) {
   2026   1.1       jtc 					g->iflags |= BAD;
   2027   1.1       jtc 					return;
   2028   1.1       jtc 				}
   2029  1.40  christos 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
   2030  1.11  christos 			/* FALLTHROUGH */
   2031  1.39  christos 		case OBOW:		/* things that break a sequence */
   2032  1.39  christos 		case OEOW:
   2033  1.39  christos 		case OBOL:
   2034  1.39  christos 		case OEOL:
   2035  1.39  christos 		case OBOS:
   2036  1.39  christos 		case OEOS:
   2037  1.39  christos 		case OWBND:
   2038  1.39  christos 		case ONWBND:
   2039  1.39  christos 		case O_QUEST:
   2040  1.39  christos 		case O_CH:
   2041  1.39  christos 		case OEND:
   2042  1.39  christos 			if (newlen > (sopno)g->mlen) {		/* ends one */
   2043   1.1       jtc 				start = newstart;
   2044   1.1       jtc 				g->mlen = newlen;
   2045  1.39  christos 				if (offset > -1) {
   2046  1.39  christos 					g->moffset += offset;
   2047  1.39  christos 					offset = newlen;
   2048  1.39  christos 				} else
   2049  1.39  christos 					g->moffset = offset;
   2050  1.39  christos 			} else {
   2051  1.39  christos 				if (offset > -1)
   2052  1.39  christos 					offset += newlen;
   2053   1.1       jtc 			}
   2054   1.1       jtc 			newlen = 0;
   2055   1.1       jtc 			break;
   2056  1.39  christos 		case OANY:
   2057  1.39  christos 			if (newlen > (sopno)g->mlen) {		/* ends one */
   2058  1.39  christos 				start = newstart;
   2059  1.39  christos 				g->mlen = newlen;
   2060  1.39  christos 				if (offset > -1) {
   2061  1.39  christos 					g->moffset += offset;
   2062  1.39  christos 					offset = newlen;
   2063  1.39  christos 				} else
   2064  1.39  christos 					g->moffset = offset;
   2065  1.39  christos 			} else {
   2066  1.39  christos 				if (offset > -1)
   2067  1.39  christos 					offset += newlen;
   2068  1.39  christos 			}
   2069  1.39  christos 			if (offset > -1)
   2070  1.39  christos 				offset++;
   2071  1.39  christos 			newlen = 0;
   2072  1.39  christos 			break;
   2073  1.39  christos 		case OANYOF:		/* may or may not invalidate offset */
   2074  1.39  christos 			/* First, everything as OANY */
   2075  1.39  christos 			if (newlen > (sopno)g->mlen) {		/* ends one */
   2076  1.39  christos 				start = newstart;
   2077  1.39  christos 				g->mlen = newlen;
   2078  1.39  christos 				if (offset > -1) {
   2079  1.39  christos 					g->moffset += offset;
   2080  1.39  christos 					offset = newlen;
   2081  1.39  christos 				} else
   2082  1.39  christos 					g->moffset = offset;
   2083  1.39  christos 			} else {
   2084  1.39  christos 				if (offset > -1)
   2085  1.39  christos 					offset += newlen;
   2086  1.39  christos 			}
   2087  1.39  christos 			if (offset > -1)
   2088  1.39  christos 				offset++;
   2089  1.39  christos 			newlen = 0;
   2090  1.39  christos 			break;
   2091  1.42  christos #ifdef NLS
   2092  1.40  christos 		toohard:/*FALLTHROUGH*/
   2093  1.42  christos #endif
   2094  1.39  christos 		default:
   2095  1.39  christos 			/* Anything here makes it impossible or too hard
   2096  1.39  christos 			 * to calculate the offset -- so we give up;
   2097  1.39  christos 			 * save the last known good offset, in case the
   2098  1.39  christos 			 * must sequence doesn't occur later.
   2099  1.39  christos 			 */
   2100  1.39  christos 			if (newlen > (sopno)g->mlen) {		/* ends one */
   2101  1.39  christos 				start = newstart;
   2102  1.39  christos 				g->mlen = newlen;
   2103  1.39  christos 				if (offset > -1)
   2104  1.39  christos 					g->moffset += offset;
   2105  1.39  christos 				else
   2106  1.39  christos 					g->moffset = offset;
   2107  1.39  christos 			}
   2108  1.39  christos 			offset = -1;
   2109  1.39  christos 			newlen = 0;
   2110  1.39  christos 			break;
   2111   1.1       jtc 		}
   2112   1.1       jtc 	} while (OP(s) != OEND);
   2113   1.1       jtc 
   2114  1.39  christos 	if (g->mlen == 0) {		/* there isn't one */
   2115  1.39  christos 		g->moffset = -1;
   2116   1.1       jtc 		return;
   2117  1.39  christos 	}
   2118   1.1       jtc 
   2119   1.1       jtc 	/* turn it into a character string */
   2120   1.1       jtc 	g->must = malloc((size_t)g->mlen + 1);
   2121   1.1       jtc 	if (g->must == NULL) {		/* argh; just forget it */
   2122   1.1       jtc 		g->mlen = 0;
   2123  1.39  christos 		g->moffset = -1;
   2124   1.1       jtc 		return;
   2125   1.1       jtc 	}
   2126   1.1       jtc 	cp = g->must;
   2127   1.1       jtc 	scan = start;
   2128  1.39  christos 	memset(&mbs, 0, sizeof(mbs));
   2129  1.39  christos 	while (cp < g->must + g->mlen) {
   2130   1.1       jtc 		while (OP(s = *scan++) != OCHAR)
   2131   1.1       jtc 			continue;
   2132  1.42  christos #ifdef NLS
   2133  1.40  christos 		clen = wcrtomb(cp, (int)OPND(s), &mbs);
   2134  1.39  christos 		assert(clen != (size_t)-1);
   2135  1.39  christos 		cp += clen;
   2136  1.42  christos #else
   2137  1.42  christos 		*cp++ = OPND(s);
   2138  1.42  christos #endif
   2139   1.1       jtc 	}
   2140   1.2       jtc 	assert(cp == g->must + g->mlen);
   2141   1.1       jtc 	*cp++ = '\0';		/* just on general principles */
   2142   1.1       jtc }
   2143   1.1       jtc 
   2144   1.1       jtc /*
   2145  1.39  christos  - altoffset - choose biggest offset among multiple choices
   2146  1.39  christos  == static int altoffset(sop *scan, int offset);
   2147  1.39  christos  *
   2148  1.39  christos  * Compute, recursively if necessary, the largest offset among multiple
   2149  1.39  christos  * re paths.
   2150  1.39  christos  */
   2151  1.39  christos static int
   2152  1.39  christos altoffset(sop *scan, int offset)
   2153  1.39  christos {
   2154  1.39  christos 	int largest;
   2155  1.39  christos 	int try;
   2156  1.39  christos 	sop s;
   2157  1.39  christos 
   2158  1.39  christos 	_DIAGASSERT(scan != NULL);
   2159  1.39  christos 
   2160  1.39  christos 	/* If we gave up already on offsets, return */
   2161  1.39  christos 	if (offset == -1)
   2162  1.39  christos 		return -1;
   2163  1.39  christos 
   2164  1.39  christos 	largest = 0;
   2165  1.39  christos 	try = 0;
   2166  1.39  christos 	s = *scan++;
   2167  1.40  christos 	while (OP(s) != O_QUEST && OP(s) != O_CH) {
   2168  1.39  christos 		switch (OP(s)) {
   2169  1.39  christos 		case OOR1:
   2170  1.39  christos 			if (try > largest)
   2171  1.39  christos 				largest = try;
   2172  1.39  christos 			try = 0;
   2173  1.39  christos 			break;
   2174  1.39  christos 		case OQUEST_:
   2175  1.39  christos 		case OCH_:
   2176  1.39  christos 			try = altoffset(scan, try);
   2177  1.39  christos 			if (try == -1)
   2178  1.39  christos 				return -1;
   2179  1.39  christos 			scan--;
   2180  1.39  christos 			do {
   2181  1.39  christos 				scan += OPND(s);
   2182  1.39  christos 				s = *scan;
   2183  1.40  christos 				if (OP(s) != O_QUEST &&
   2184  1.40  christos 				    OP(s) != O_CH && OP(s) != OOR2)
   2185  1.39  christos 					return -1;
   2186  1.40  christos 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
   2187  1.39  christos 			/* We must skip to the next position, or we'll
   2188  1.39  christos 			 * leave altoffset() too early.
   2189  1.39  christos 			 */
   2190  1.39  christos 			scan++;
   2191  1.39  christos 			break;
   2192  1.39  christos 		case OANYOF:
   2193  1.39  christos 		case OCHAR:
   2194  1.39  christos 		case OANY:
   2195  1.39  christos 			try++;
   2196  1.40  christos 			/*FALLTHROUGH*/
   2197  1.39  christos 		case OBOW:
   2198  1.39  christos 		case OEOW:
   2199  1.39  christos 		case OWBND:
   2200  1.39  christos 		case ONWBND:
   2201  1.39  christos 		case OLPAREN:
   2202  1.39  christos 		case ORPAREN:
   2203  1.39  christos 		case OOR2:
   2204  1.39  christos 			break;
   2205  1.39  christos 		default:
   2206  1.39  christos 			try = -1;
   2207  1.39  christos 			break;
   2208  1.39  christos 		}
   2209  1.39  christos 		if (try == -1)
   2210  1.39  christos 			return -1;
   2211  1.39  christos 		s = *scan++;
   2212  1.39  christos 	}
   2213  1.39  christos 
   2214  1.39  christos 	if (try > largest)
   2215  1.39  christos 		largest = try;
   2216  1.39  christos 
   2217  1.39  christos 	return largest+offset;
   2218  1.39  christos }
   2219  1.39  christos 
   2220  1.39  christos /*
   2221  1.39  christos  - computejumps - compute char jumps for BM scan
   2222  1.39  christos  == static void computejumps(struct parse *p, struct re_guts *g);
   2223  1.39  christos  *
   2224  1.39  christos  * This algorithm assumes g->must exists and is has size greater than
   2225  1.39  christos  * zero. It's based on the algorithm found on Computer Algorithms by
   2226  1.39  christos  * Sara Baase.
   2227  1.39  christos  *
   2228  1.39  christos  * A char jump is the number of characters one needs to jump based on
   2229  1.39  christos  * the value of the character from the text that was mismatched.
   2230  1.39  christos  */
   2231  1.39  christos static void
   2232  1.39  christos computejumps(struct parse *p, struct re_guts *g)
   2233  1.39  christos {
   2234  1.39  christos 	int ch;
   2235  1.39  christos 	size_t mindex;
   2236  1.39  christos 
   2237  1.39  christos 	_DIAGASSERT(p != NULL);
   2238  1.39  christos 	_DIAGASSERT(g != NULL);
   2239  1.39  christos 
   2240  1.39  christos 	/* Avoid making errors worse */
   2241  1.39  christos 	if (p->error != 0)
   2242  1.39  christos 		return;
   2243  1.39  christos 
   2244  1.39  christos 	g->charjump = calloc((NC_MAX + 1), sizeof(*g->charjump));
   2245  1.39  christos 	if (g->charjump == NULL)	/* Not a fatal error */
   2246  1.39  christos 		return;
   2247  1.39  christos 	/* Adjust for signed chars, if necessary */
   2248  1.39  christos 	g->charjump = &g->charjump[-(CHAR_MIN)];
   2249  1.39  christos 
   2250  1.39  christos 	/* If the character does not exist in the pattern, the jump
   2251  1.39  christos 	 * is equal to the number of characters in the pattern.
   2252  1.39  christos 	 */
   2253  1.39  christos 	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
   2254  1.39  christos 		g->charjump[ch] = g->mlen;
   2255  1.39  christos 
   2256  1.39  christos 	/* If the character does exist, compute the jump that would
   2257  1.39  christos 	 * take us to the last character in the pattern equal to it
   2258  1.39  christos 	 * (notice that we match right to left, so that last character
   2259  1.39  christos 	 * is the first one that would be matched).
   2260  1.39  christos 	 */
   2261  1.39  christos 	for (mindex = 0; mindex < g->mlen; mindex++)
   2262  1.39  christos 		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
   2263  1.39  christos }
   2264  1.39  christos 
   2265  1.39  christos /*
   2266  1.39  christos  - computematchjumps - compute match jumps for BM scan
   2267  1.39  christos  == static void computematchjumps(struct parse *p, struct re_guts *g);
   2268  1.39  christos  *
   2269  1.39  christos  * This algorithm assumes g->must exists and is has size greater than
   2270  1.39  christos  * zero. It's based on the algorithm found on Computer Algorithms by
   2271  1.39  christos  * Sara Baase.
   2272  1.39  christos  *
   2273  1.39  christos  * A match jump is the number of characters one needs to advance based
   2274  1.39  christos  * on the already-matched suffix.
   2275  1.39  christos  * Notice that all values here are minus (g->mlen-1), because of the way
   2276  1.39  christos  * the search algorithm works.
   2277  1.39  christos  */
   2278  1.39  christos static void
   2279  1.39  christos computematchjumps(struct parse *p, struct re_guts *g)
   2280  1.39  christos {
   2281  1.39  christos 	size_t mindex;		/* General "must" iterator */
   2282  1.39  christos 	size_t suffix;		/* Keeps track of matching suffix */
   2283  1.39  christos 	size_t ssuffix;		/* Keeps track of suffixes' suffix */
   2284  1.39  christos 	size_t* pmatches;	/* pmatches[k] points to the next i
   2285  1.39  christos 				 * such that i+1...mlen is a substring
   2286  1.39  christos 				 * of k+1...k+mlen-i-1
   2287  1.39  christos 				 */
   2288  1.39  christos 
   2289  1.39  christos 	_DIAGASSERT(p != NULL);
   2290  1.39  christos 	_DIAGASSERT(g != NULL);
   2291  1.39  christos 
   2292  1.39  christos 	/* Avoid making errors worse */
   2293  1.39  christos 	if (p->error != 0)
   2294  1.39  christos 		return;
   2295  1.39  christos 
   2296  1.39  christos 	pmatches = calloc(g->mlen, sizeof(*pmatches));
   2297  1.39  christos 	if (pmatches == NULL) {
   2298  1.39  christos 		g->matchjump = NULL;
   2299  1.39  christos 		return;
   2300  1.39  christos 	}
   2301  1.39  christos 
   2302  1.39  christos 	g->matchjump = calloc(g->mlen, sizeof(*g->matchjump));
   2303  1.39  christos 	if (g->matchjump == NULL) {	/* Not a fatal error */
   2304  1.39  christos 		free(pmatches);
   2305  1.39  christos 		return;
   2306  1.39  christos 	}
   2307  1.39  christos 
   2308  1.39  christos 	/* Set maximum possible jump for each character in the pattern */
   2309  1.39  christos 	for (mindex = 0; mindex < g->mlen; mindex++)
   2310  1.39  christos 		g->matchjump[mindex] = 2 * g->mlen - mindex - 1;
   2311  1.39  christos 
   2312  1.39  christos 	/* Compute pmatches[] */
   2313  1.39  christos 	for (suffix = mindex = g->mlen; mindex-- > 0; suffix--) {
   2314  1.39  christos 		pmatches[mindex] = suffix;
   2315  1.39  christos 
   2316  1.39  christos 		/* If a mismatch is found, interrupting the substring,
   2317  1.39  christos 		 * compute the matchjump for that position. If no
   2318  1.39  christos 		 * mismatch is found, then a text substring mismatched
   2319  1.39  christos 		 * against the suffix will also mismatch against the
   2320  1.39  christos 		 * substring.
   2321  1.39  christos 		 */
   2322  1.39  christos 		while (suffix < g->mlen
   2323  1.39  christos 		    && g->must[mindex] != g->must[suffix]) {
   2324  1.39  christos 			g->matchjump[suffix] = MIN(g->matchjump[suffix],
   2325  1.39  christos 			    g->mlen - mindex - 1);
   2326  1.39  christos 			suffix = pmatches[suffix];
   2327  1.39  christos 		}
   2328  1.39  christos 	}
   2329  1.39  christos 
   2330  1.39  christos 	/* Compute the matchjump up to the last substring found to jump
   2331  1.39  christos 	 * to the beginning of the largest must pattern prefix matching
   2332  1.39  christos 	 * it's own suffix.
   2333  1.39  christos 	 */
   2334  1.39  christos 	for (mindex = 0; mindex <= suffix; mindex++)
   2335  1.39  christos 		g->matchjump[mindex] = MIN(g->matchjump[mindex],
   2336  1.39  christos 		    g->mlen + suffix - mindex);
   2337  1.39  christos 
   2338  1.39  christos         ssuffix = pmatches[suffix];
   2339  1.39  christos         while (suffix < g->mlen) {
   2340  1.39  christos                 while (suffix <= ssuffix && suffix < g->mlen) {
   2341  1.39  christos                         g->matchjump[suffix] = MIN(g->matchjump[suffix],
   2342  1.39  christos 			    g->mlen + ssuffix - suffix);
   2343  1.39  christos                         suffix++;
   2344  1.39  christos                 }
   2345  1.39  christos 		if (suffix < g->mlen)
   2346  1.39  christos                 	ssuffix = pmatches[ssuffix];
   2347  1.39  christos         }
   2348  1.39  christos 
   2349  1.39  christos 	free(pmatches);
   2350  1.39  christos }
   2351  1.39  christos 
   2352  1.39  christos /*
   2353   1.1       jtc  - pluscount - count + nesting
   2354   1.9     perry  == static sopno pluscount(struct parse *p, struct re_guts *g);
   2355   1.1       jtc  */
   2356   1.1       jtc static sopno			/* nesting depth */
   2357  1.39  christos pluscount(struct parse *p, struct re_guts *g)
   2358   1.1       jtc {
   2359   1.9     perry 	sop *scan;
   2360   1.9     perry 	sop s;
   2361   1.9     perry 	sopno plusnest = 0;
   2362   1.9     perry 	sopno maxnest = 0;
   2363  1.14     lukem 
   2364  1.14     lukem 	_DIAGASSERT(p != NULL);
   2365  1.14     lukem 	_DIAGASSERT(g != NULL);
   2366   1.1       jtc 
   2367   1.1       jtc 	if (p->error != 0)
   2368   1.1       jtc 		return(0);	/* there may not be an OEND */
   2369   1.1       jtc 
   2370   1.1       jtc 	scan = g->strip + 1;
   2371   1.1       jtc 	do {
   2372   1.1       jtc 		s = *scan++;
   2373   1.1       jtc 		switch (OP(s)) {
   2374   1.1       jtc 		case OPLUS_:
   2375   1.1       jtc 			plusnest++;
   2376   1.1       jtc 			break;
   2377   1.1       jtc 		case O_PLUS:
   2378   1.1       jtc 			if (plusnest > maxnest)
   2379   1.1       jtc 				maxnest = plusnest;
   2380   1.1       jtc 			plusnest--;
   2381   1.1       jtc 			break;
   2382   1.1       jtc 		}
   2383   1.1       jtc 	} while (OP(s) != OEND);
   2384   1.1       jtc 	if (plusnest != 0)
   2385   1.1       jtc 		g->iflags |= BAD;
   2386   1.1       jtc 	return(maxnest);
   2387   1.1       jtc }
   2388