Home | History | Annotate | Line # | Download | only in regex
regcomp.c revision 1.45
      1  1.45  christos /*	$NetBSD: regcomp.c,v 1.45 2021/02/26 19:24:47 christos Exp $	*/
      2   1.6       cgd 
      3   1.5       cgd /*-
      4  1.39  christos  * SPDX-License-Identifier: BSD-3-Clause
      5  1.39  christos  *
      6  1.39  christos  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
      7   1.5       cgd  * Copyright (c) 1992, 1993, 1994
      8   1.5       cgd  *	The Regents of the University of California.  All rights reserved.
      9   1.5       cgd  *
     10  1.39  christos  * Copyright (c) 2011 The FreeBSD Foundation
     11  1.39  christos  * All rights reserved.
     12  1.39  christos  * Portions of this software were developed by David Chisnall
     13  1.39  christos  * under sponsorship from the FreeBSD Foundation.
     14  1.39  christos  *
     15   1.5       cgd  * This code is derived from software contributed to Berkeley by
     16   1.5       cgd  * Henry Spencer.
     17   1.5       cgd  *
     18   1.5       cgd  * Redistribution and use in source and binary forms, with or without
     19   1.5       cgd  * modification, are permitted provided that the following conditions
     20   1.5       cgd  * are met:
     21   1.5       cgd  * 1. Redistributions of source code must retain the above copyright
     22   1.5       cgd  *    notice, this list of conditions and the following disclaimer.
     23   1.5       cgd  * 2. Redistributions in binary form must reproduce the above copyright
     24   1.5       cgd  *    notice, this list of conditions and the following disclaimer in the
     25   1.5       cgd  *    documentation and/or other materials provided with the distribution.
     26  1.18       agc  * 3. Neither the name of the University nor the names of its contributors
     27  1.18       agc  *    may be used to endorse or promote products derived from this software
     28  1.18       agc  *    without specific prior written permission.
     29  1.18       agc  *
     30  1.18       agc  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     31  1.18       agc  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     32  1.18       agc  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     33  1.18       agc  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     34  1.18       agc  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     35  1.18       agc  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     36  1.18       agc  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     37  1.18       agc  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     38  1.18       agc  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     39  1.18       agc  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     40  1.18       agc  * SUCH DAMAGE.
     41  1.18       agc  *
     42  1.18       agc  *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
     43  1.18       agc  */
     44  1.18       agc 
     45  1.45  christos #if HAVE_NBTOOL_CONFIG_H
     46  1.45  christos #include "nbtool_config.h"
     47  1.45  christos #endif
     48  1.45  christos 
     49   1.7  christos #include <sys/cdefs.h>
     50   1.6       cgd #if 0
     51   1.5       cgd static char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
     52  1.39  christos __FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 368359 2020-12-05 03:18:48Z kevans $");
     53   1.6       cgd #endif
     54  1.45  christos __RCSID("$NetBSD: regcomp.c,v 1.45 2021/02/26 19:24:47 christos Exp $");
     55  1.39  christos 
     56  1.39  christos #define _OPENBSD_SOURCE
     57  1.42  christos 
     58  1.42  christos #ifndef LIBHACK
     59  1.39  christos #define REGEX_GNU_EXTENSIONS
     60   1.5       cgd 
     61   1.8       jtc #include "namespace.h"
     62  1.42  christos #endif
     63   1.1       jtc #include <sys/types.h>
     64  1.39  christos #include <stdio.h>
     65  1.39  christos #include <string.h>
     66   1.1       jtc #include <ctype.h>
     67   1.1       jtc #include <limits.h>
     68   1.1       jtc #include <stdlib.h>
     69  1.28  junyoung #include <regex.h>
     70  1.39  christos #include <stdbool.h>
     71   1.8       jtc 
     72  1.44  christos #if defined(__weak_alias) && !defined(LIBHACK)
     73  1.16   mycroft __weak_alias(regcomp,_regcomp)
     74   1.8       jtc #endif
     75   1.1       jtc 
     76  1.39  christos #ifdef REGEX_LIBC_COLLATE
     77  1.39  christos #include "collate.h"
     78  1.39  christos #endif
     79  1.39  christos 
     80   1.1       jtc #include "utils.h"
     81   1.1       jtc #include "regex2.h"
     82   1.1       jtc 
     83   1.1       jtc #include "cname.h"
     84   1.1       jtc 
     85   1.1       jtc /*
     86  1.39  christos  * Branching context, used to keep track of branch state for all of the branch-
     87  1.39  christos  * aware functions. In addition to keeping track of branch positions for the
     88  1.39  christos  * p_branch_* functions, we use this to simplify some clumsiness in BREs for
     89  1.39  christos  * detection of whether ^ is acting as an anchor or being used erroneously and
     90  1.39  christos  * also for whether we're in a sub-expression or not.
     91  1.39  christos  */
     92  1.39  christos struct branchc {
     93  1.39  christos 	sopno start;
     94  1.39  christos 	sopno back;
     95  1.39  christos 	sopno fwd;
     96  1.39  christos 
     97  1.39  christos 	int nbranch;
     98  1.39  christos 	int nchain;
     99  1.39  christos 	bool outer;
    100  1.39  christos 	bool terminate;
    101  1.39  christos };
    102  1.39  christos 
    103  1.39  christos /*
    104   1.1       jtc  * parse structure, passed up and down to avoid global variables and
    105   1.1       jtc  * other clumsinesses
    106   1.1       jtc  */
    107   1.1       jtc struct parse {
    108  1.21      yamt 	const char *next;	/* next character in RE */
    109  1.21      yamt 	const char *end;	/* end of string (-> NUL normally) */
    110   1.1       jtc 	int error;		/* has an error been seen? */
    111  1.39  christos 	int gnuext;
    112   1.1       jtc 	sop *strip;		/* malloced strip */
    113   1.1       jtc 	sopno ssize;		/* malloced strip size (allocated) */
    114   1.1       jtc 	sopno slen;		/* malloced strip length (used) */
    115  1.33  christos 	size_t ncsalloc;	/* number of csets allocated */
    116   1.1       jtc 	struct re_guts *g;
    117   1.1       jtc #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
    118   1.1       jtc 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
    119   1.1       jtc 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
    120  1.39  christos 	bool allowbranch;	/* can this expression branch? */
    121  1.39  christos 	bool bre;		/* convenience; is this a BRE? */
    122  1.39  christos 	int pflags;		/* other parsing flags -- legacy escapes? */
    123  1.39  christos 	bool (*parse_expr)(struct parse *, struct branchc *);
    124  1.39  christos 	void (*pre_parse)(struct parse *, struct branchc *);
    125  1.39  christos 	void (*post_parse)(struct parse *, struct branchc *);
    126   1.1       jtc };
    127   1.1       jtc 
    128  1.39  christos #define PFLAG_LEGACY_ESC	0x00000001
    129  1.39  christos 
    130   1.5       cgd /* ========= begin header generated by ./mkh ========= */
    131   1.5       cgd #ifdef __cplusplus
    132   1.5       cgd extern "C" {
    133   1.5       cgd #endif
    134   1.5       cgd 
    135   1.5       cgd /* === regcomp.c === */
    136  1.39  christos static bool p_ere_exp(struct parse *p, struct branchc *bc);
    137  1.26  junyoung static void p_str(struct parse *p);
    138  1.39  christos static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
    139  1.39  christos static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
    140  1.39  christos static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
    141  1.39  christos static bool p_branch_empty(struct parse *p, struct branchc *bc);
    142  1.39  christos static bool p_branch_do(struct parse *p, struct branchc *bc);
    143  1.39  christos static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
    144  1.39  christos static void p_bre_post_parse(struct parse *p, struct branchc *bc);
    145  1.39  christos static void p_re(struct parse *p, int end1, int end2);
    146  1.39  christos static bool p_simp_re(struct parse *p, struct branchc *bc);
    147  1.26  junyoung static int p_count(struct parse *p);
    148  1.26  junyoung static void p_bracket(struct parse *p);
    149  1.39  christos static int p_range_cmp(wchar_t c1, wchar_t c2);
    150  1.26  junyoung static void p_b_term(struct parse *p, cset *cs);
    151  1.39  christos #ifdef REGEX_GNU_EXTENSIONS
    152  1.39  christos static int p_b_pseudoclass(struct parse *p, char c);
    153  1.39  christos #endif
    154  1.26  junyoung static void p_b_cclass(struct parse *p, cset *cs);
    155  1.39  christos static void p_b_cclass_named(struct parse *p, cset *cs, const char[]);
    156  1.26  junyoung static void p_b_eclass(struct parse *p, cset *cs);
    157  1.39  christos static wint_t p_b_symbol(struct parse *p);
    158  1.39  christos static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
    159  1.39  christos static bool may_escape(struct parse *p, const wint_t ch);
    160  1.39  christos static wint_t othercase(wint_t ch);
    161  1.39  christos static void bothcases(struct parse *p, wint_t ch);
    162  1.39  christos static void ordinary(struct parse *p, wint_t ch);
    163  1.26  junyoung static void nonnewline(struct parse *p);
    164  1.39  christos static void repeat(struct parse *p, sopno start, int from, int to);
    165  1.26  junyoung static int seterr(struct parse *p, int e);
    166  1.26  junyoung static cset *allocset(struct parse *p);
    167  1.26  junyoung static void freeset(struct parse *p, cset *cs);
    168  1.39  christos static void CHadd(struct parse *p, cset *cs, wint_t ch);
    169  1.39  christos static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
    170  1.39  christos static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
    171  1.39  christos static wint_t singleton(cset *cs);
    172  1.26  junyoung static sopno dupl(struct parse *p, sopno start, sopno finish);
    173  1.39  christos static void doemit(struct parse *p, sop op, size_t opnd);
    174  1.39  christos static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
    175  1.39  christos static void dofwd(struct parse *p, sopno pos, sop value);
    176  1.30  christos static int enlarge(struct parse *p, sopno size);
    177  1.26  junyoung static void stripsnug(struct parse *p, struct re_guts *g);
    178  1.26  junyoung static void findmust(struct parse *p, struct re_guts *g);
    179  1.39  christos static int altoffset(sop *scan, int offset);
    180  1.39  christos static void computejumps(struct parse *p, struct re_guts *g);
    181  1.39  christos static void computematchjumps(struct parse *p, struct re_guts *g);
    182  1.26  junyoung static sopno pluscount(struct parse *p, struct re_guts *g);
    183  1.39  christos static wint_t wgetnext(struct parse *p);
    184   1.5       cgd 
    185   1.5       cgd #ifdef __cplusplus
    186   1.5       cgd }
    187   1.5       cgd #endif
    188   1.5       cgd /* ========= end header generated by ./mkh ========= */
    189   1.1       jtc 
    190   1.1       jtc static char nuls[10];		/* place to point scanner in event of error */
    191   1.1       jtc 
    192   1.1       jtc /*
    193   1.1       jtc  * macros for use with parse structure
    194   1.1       jtc  * BEWARE:  these know that the parse structure is named `p' !!!
    195   1.1       jtc  */
    196   1.1       jtc #define	PEEK()	(*p->next)
    197   1.1       jtc #define	PEEK2()	(*(p->next+1))
    198   1.1       jtc #define	MORE()	(p->next < p->end)
    199   1.1       jtc #define	MORE2()	(p->next+1 < p->end)
    200   1.1       jtc #define	SEE(c)	(MORE() && PEEK() == (c))
    201   1.1       jtc #define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
    202  1.39  christos #define	SEESPEC(a)	(p->bre ? SEETWO('\\', a) : SEE(a))
    203   1.1       jtc #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
    204   1.1       jtc #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
    205  1.39  christos #define	EATSPEC(a)	(p->bre ? EATTWO('\\', a) : EAT(a))
    206   1.1       jtc #define	NEXT()	(p->next++)
    207   1.1       jtc #define	NEXT2()	(p->next += 2)
    208   1.1       jtc #define	NEXTn(n)	(p->next += (n))
    209   1.1       jtc #define	GETNEXT()	(*p->next++)
    210  1.39  christos #define	WGETNEXT()	wgetnext(p)
    211   1.1       jtc #define	SETERROR(e)	seterr(p, (e))
    212  1.39  christos #define	REQUIRE(co, e)	((co) || SETERROR(e))
    213   1.1       jtc #define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
    214  1.39  christos #define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
    215   1.1       jtc #define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
    216  1.40  christos #define	EMIT(op, sopnd)	doemit(p, (op), (sopnd))
    217  1.40  christos #define	INSERT(op, pos)	doinsert(p, (op), HERE()-(pos)+1, pos)
    218  1.12  drochner #define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
    219   1.1       jtc #define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
    220   1.1       jtc #define	HERE()		(p->slen)
    221   1.1       jtc #define	THERE()		(p->slen - 1)
    222   1.4       jtc #define	THERETHERE()	(p->slen - 2)
    223   1.1       jtc #define	DROP(n)	(p->slen -= (n))
    224   1.1       jtc 
    225  1.39  christos /* Macro used by computejump()/computematchjump() */
    226  1.41  christos #ifndef MIN
    227  1.39  christos #define MIN(a,b)	((a)<(b)?(a):(b))
    228  1.41  christos #endif
    229  1.30  christos 
    230  1.42  christos #ifndef NLS
    231  1.42  christos static const struct {
    232  1.42  christos 	const char *name;
    233  1.42  christos 	int (*func)(int);
    234  1.42  christos } wctypes[] = {
    235  1.42  christos #define ADD(x) { .name = # x, .func = is ## x }
    236  1.42  christos 	ADD(alnum),
    237  1.42  christos 	ADD(alpha),
    238  1.42  christos 	ADD(blank),
    239  1.42  christos 	ADD(cntrl),
    240  1.42  christos 	ADD(digit),
    241  1.42  christos 	ADD(graph),
    242  1.42  christos 	ADD(lower),
    243  1.42  christos 	ADD(print),
    244  1.42  christos 	ADD(punct),
    245  1.42  christos 	ADD(space),
    246  1.42  christos 	ADD(upper),
    247  1.42  christos 	ADD(xdigit),
    248  1.42  christos #undef ADD
    249  1.42  christos };
    250  1.42  christos 
    251  1.42  christos wctype_t
    252  1.42  christos __regex_wctype(const char *str)
    253  1.42  christos {
    254  1.42  christos 	for (size_t i = 0; i < __arraycount(wctypes); i++) {
    255  1.42  christos 		if (strcmp(wctypes[i].name, str) == 0)
    256  1.42  christos 			return (wctype_t)(i + 1);
    257  1.42  christos 	}
    258  1.42  christos 	return (wctype_t)0;
    259  1.42  christos }
    260  1.42  christos 
    261  1.42  christos int
    262  1.42  christos __regex_iswctype(wint_t c, wctype_t ct)
    263  1.42  christos {
    264  1.42  christos 	if (ct == 0)
    265  1.42  christos 		return 0;
    266  1.42  christos 	return (*wctypes[ct - 1].func)(c);
    267  1.42  christos }
    268  1.42  christos #endif
    269  1.42  christos 
    270  1.39  christos static int				/* 0 success, otherwise REG_something */
    271  1.39  christos regcomp_internal(regex_t * __restrict preg,
    272  1.39  christos 	const char * __restrict pattern,
    273  1.39  christos 	int cflags, int pflags)
    274   1.1       jtc {
    275   1.1       jtc 	struct parse pa;
    276   1.9     perry 	struct re_guts *g;
    277   1.9     perry 	struct parse *p = &pa;
    278   1.9     perry 	int i;
    279   1.9     perry 	size_t len;
    280  1.39  christos 	size_t maxlen;
    281   1.3       jtc #ifdef REDEBUG
    282   1.3       jtc #	define	GOODFLAGS(f)	(f)
    283   1.3       jtc #else
    284   1.3       jtc #	define	GOODFLAGS(f)	((f)&~REG_DUMP)
    285   1.3       jtc #endif
    286   1.1       jtc 
    287  1.14     lukem 	_DIAGASSERT(preg != NULL);
    288  1.14     lukem 	_DIAGASSERT(pattern != NULL);
    289  1.14     lukem 
    290   1.3       jtc 	cflags = GOODFLAGS(cflags);
    291   1.1       jtc 	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
    292   1.1       jtc 		return(REG_INVARG);
    293   1.1       jtc 
    294   1.1       jtc 	if (cflags&REG_PEND) {
    295   1.1       jtc 		if (preg->re_endp < pattern)
    296   1.1       jtc 			return(REG_INVARG);
    297   1.1       jtc 		len = preg->re_endp - pattern;
    298   1.1       jtc 	} else
    299  1.11  christos 		len = strlen(pattern);
    300   1.1       jtc 
    301   1.1       jtc 	/* do the mallocs early so failure handling is easy */
    302  1.39  christos 	g = malloc(sizeof(*g));
    303   1.1       jtc 	if (g == NULL)
    304   1.1       jtc 		return(REG_ESPACE);
    305  1.39  christos 	/*
    306  1.39  christos 	 * Limit the pattern space to avoid a 32-bit overflow on buffer
    307  1.39  christos 	 * extension.  Also avoid any signed overflow in case of conversion
    308  1.39  christos 	 * so make the real limit based on a 31-bit overflow.
    309  1.39  christos 	 *
    310  1.39  christos 	 * Likely not applicable on 64-bit systems but handle the case
    311  1.39  christos 	 * generically (who are we to stop people from using ~715MB+
    312  1.39  christos 	 * patterns?).
    313  1.39  christos 	 */
    314  1.40  christos 	maxlen = ((size_t)-1 >> 1) / sizeof(*p->strip) * 2 / 3;
    315  1.39  christos 	if (len >= maxlen) {
    316  1.40  christos 		free(g);
    317  1.39  christos 		return(REG_ESPACE);
    318  1.39  christos 	}
    319  1.40  christos 	p->ssize = (sopno)(len / 2 * 3 + 1);	/* ugh */
    320  1.39  christos 	assert(p->ssize >= len);
    321  1.39  christos 
    322  1.39  christos 	p->strip = calloc(p->ssize, sizeof(*p->strip));
    323   1.1       jtc 	p->slen = 0;
    324   1.1       jtc 	if (p->strip == NULL) {
    325  1.40  christos 		free(g);
    326   1.1       jtc 		return(REG_ESPACE);
    327   1.1       jtc 	}
    328   1.1       jtc 
    329   1.1       jtc 	/* set things up */
    330   1.1       jtc 	p->g = g;
    331  1.39  christos 	p->next = pattern;	/* convenience; we do not modify it */
    332   1.1       jtc 	p->end = p->next + len;
    333   1.1       jtc 	p->error = 0;
    334   1.1       jtc 	p->ncsalloc = 0;
    335  1.39  christos 	p->pflags = pflags;
    336   1.1       jtc 	for (i = 0; i < NPAREN; i++) {
    337   1.1       jtc 		p->pbegin[i] = 0;
    338   1.1       jtc 		p->pend[i] = 0;
    339   1.1       jtc 	}
    340  1.39  christos #ifdef REGEX_GNU_EXTENSIONS
    341  1.39  christos 	if ((cflags & REG_GNU) == 0) {
    342  1.39  christos 		p->gnuext = false;
    343  1.39  christos 		p->allowbranch = (cflags & REG_EXTENDED) != 0;
    344  1.39  christos 	} else
    345  1.39  christos 		p->gnuext = p->allowbranch = true;
    346  1.39  christos #else
    347  1.39  christos 	p->gnuext = false;
    348  1.39  christos 	p->allowbranch = (cflags & REG_EXTENDED) != 0;
    349  1.39  christos #endif
    350  1.39  christos 	if (cflags & REG_EXTENDED) {
    351  1.39  christos 		p->bre = false;
    352  1.39  christos 		p->parse_expr = p_ere_exp;
    353  1.39  christos 		p->pre_parse = NULL;
    354  1.39  christos 		p->post_parse = NULL;
    355  1.39  christos 	} else {
    356  1.39  christos 		p->bre = true;
    357  1.39  christos 		p->parse_expr = p_simp_re;
    358  1.39  christos 		p->pre_parse = p_bre_pre_parse;
    359  1.39  christos 		p->post_parse = p_bre_post_parse;
    360  1.39  christos 	}
    361   1.1       jtc 	g->sets = NULL;
    362   1.1       jtc 	g->ncsets = 0;
    363   1.1       jtc 	g->cflags = cflags;
    364   1.1       jtc 	g->iflags = 0;
    365   1.1       jtc 	g->nbol = 0;
    366   1.1       jtc 	g->neol = 0;
    367   1.1       jtc 	g->must = NULL;
    368  1.39  christos 	g->moffset = -1;
    369  1.39  christos 	g->charjump = NULL;
    370  1.39  christos 	g->matchjump = NULL;
    371   1.1       jtc 	g->mlen = 0;
    372   1.1       jtc 	g->nsub = 0;
    373   1.1       jtc 	g->backrefs = 0;
    374   1.1       jtc 
    375   1.1       jtc 	/* do it */
    376   1.1       jtc 	EMIT(OEND, 0);
    377   1.1       jtc 	g->firststate = THERE();
    378  1.39  christos 	if (cflags & REG_NOSPEC)
    379   1.1       jtc 		p_str(p);
    380   1.1       jtc 	else
    381  1.39  christos 		p_re(p, OUT, OUT);
    382   1.1       jtc 	EMIT(OEND, 0);
    383   1.1       jtc 	g->laststate = THERE();
    384   1.1       jtc 
    385   1.1       jtc 	/* tidy up loose ends and fill things in */
    386   1.1       jtc 	stripsnug(p, g);
    387   1.1       jtc 	findmust(p, g);
    388  1.39  christos 	/* only use Boyer-Moore algorithm if the pattern is bigger
    389  1.39  christos 	 * than three characters
    390  1.39  christos 	 */
    391  1.39  christos 	if(g->mlen > 3) {
    392  1.39  christos 		computejumps(p, g);
    393  1.39  christos 		computematchjumps(p, g);
    394  1.39  christos 		if(g->matchjump == NULL && g->charjump != NULL) {
    395  1.39  christos 			free(g->charjump);
    396  1.39  christos 			g->charjump = NULL;
    397  1.39  christos 		}
    398  1.39  christos 	}
    399   1.1       jtc 	g->nplus = pluscount(p, g);
    400   1.1       jtc 	g->magic = MAGIC2;
    401   1.1       jtc 	preg->re_nsub = g->nsub;
    402   1.1       jtc 	preg->re_g = g;
    403   1.1       jtc 	preg->re_magic = MAGIC1;
    404   1.1       jtc #ifndef REDEBUG
    405   1.1       jtc 	/* not debugging, so can't rely on the assert() in regexec() */
    406   1.1       jtc 	if (g->iflags&BAD)
    407   1.1       jtc 		SETERROR(REG_ASSERT);
    408   1.1       jtc #endif
    409   1.1       jtc 
    410   1.1       jtc 	/* win or lose, we're done */
    411   1.1       jtc 	if (p->error != 0)	/* lose */
    412   1.1       jtc 		regfree(preg);
    413   1.1       jtc 	return(p->error);
    414   1.1       jtc }
    415   1.1       jtc 
    416   1.1       jtc /*
    417  1.39  christos  - regcomp - interface for parser and compilation
    418  1.39  christos  = extern int regcomp(regex_t *, const char *, int);
    419  1.39  christos  = #define	REG_BASIC	0000
    420  1.39  christos  = #define	REG_EXTENDED	0001
    421  1.39  christos  = #define	REG_ICASE	0002
    422  1.39  christos  = #define	REG_NOSUB	0004
    423  1.39  christos  = #define	REG_NEWLINE	0010
    424  1.39  christos  = #define	REG_NOSPEC	0020
    425  1.39  christos  = #define	REG_PEND	0040
    426  1.39  christos  = #define	REG_DUMP	0200
    427   1.1       jtc  */
    428  1.39  christos int				/* 0 success, otherwise REG_something */
    429  1.39  christos regcomp(regex_t * __restrict preg,
    430  1.39  christos 	const char * __restrict pattern,
    431  1.39  christos 	int cflags)
    432   1.1       jtc {
    433  1.30  christos 
    434  1.39  christos 	return (regcomp_internal(preg, pattern, cflags, 0));
    435   1.1       jtc }
    436   1.1       jtc 
    437   1.1       jtc /*
    438  1.39  christos  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
    439  1.39  christos  - return whether we should terminate or not
    440  1.39  christos  == static bool p_ere_exp(struct parse *p);
    441   1.1       jtc  */
    442  1.39  christos static bool
    443  1.39  christos p_ere_exp(struct parse *p, struct branchc *bc)
    444   1.1       jtc {
    445   1.9     perry 	char c;
    446  1.39  christos 	wint_t wc;
    447   1.9     perry 	sopno pos;
    448   1.9     perry 	int count;
    449   1.9     perry 	int count2;
    450  1.39  christos #ifdef REGEX_GNU_EXTENSIONS
    451  1.40  christos 	size_t i;
    452  1.39  christos 	int handled;
    453  1.39  christos #endif
    454   1.9     perry 	sopno subno;
    455   1.1       jtc 	int wascaret = 0;
    456   1.1       jtc 
    457  1.14     lukem 	_DIAGASSERT(p != NULL);
    458  1.14     lukem 
    459  1.39  christos 	(void)bc;
    460   1.1       jtc 	assert(MORE());		/* caller should have ensured this */
    461   1.1       jtc 	c = GETNEXT();
    462   1.1       jtc 
    463  1.39  christos #ifdef REGEX_GNU_EXTENSIONS
    464  1.39  christos 	handled = 0;
    465  1.39  christos #endif
    466   1.1       jtc 	pos = HERE();
    467   1.1       jtc 	switch (c) {
    468   1.1       jtc 	case '(':
    469  1.39  christos 		(void)REQUIRE(MORE(), REG_EPAREN);
    470   1.1       jtc 		p->g->nsub++;
    471  1.40  christos 		subno = (sopno)p->g->nsub;
    472   1.1       jtc 		if (subno < NPAREN)
    473   1.1       jtc 			p->pbegin[subno] = HERE();
    474   1.1       jtc 		EMIT(OLPAREN, subno);
    475   1.1       jtc 		if (!SEE(')'))
    476  1.39  christos 			p_re(p, ')', IGN);
    477   1.1       jtc 		if (subno < NPAREN) {
    478   1.1       jtc 			p->pend[subno] = HERE();
    479   1.1       jtc 			assert(p->pend[subno] != 0);
    480   1.1       jtc 		}
    481   1.1       jtc 		EMIT(ORPAREN, subno);
    482  1.39  christos 		(void)MUSTEAT(')', REG_EPAREN);
    483   1.1       jtc 		break;
    484   1.1       jtc #ifndef POSIX_MISTAKE
    485   1.1       jtc 	case ')':		/* happens only if no current unmatched ( */
    486   1.1       jtc 		/*
    487   1.1       jtc 		 * You may ask, why the ifndef?  Because I didn't notice
    488   1.1       jtc 		 * this until slightly too late for 1003.2, and none of the
    489   1.1       jtc 		 * other 1003.2 regular-expression reviewers noticed it at
    490   1.1       jtc 		 * all.  So an unmatched ) is legal POSIX, at least until
    491   1.1       jtc 		 * we can get it fixed.
    492   1.1       jtc 		 */
    493   1.1       jtc 		SETERROR(REG_EPAREN);
    494   1.1       jtc 		break;
    495   1.1       jtc #endif
    496   1.1       jtc 	case '^':
    497   1.1       jtc 		EMIT(OBOL, 0);
    498   1.1       jtc 		p->g->iflags |= USEBOL;
    499   1.1       jtc 		p->g->nbol++;
    500   1.1       jtc 		wascaret = 1;
    501   1.1       jtc 		break;
    502   1.1       jtc 	case '$':
    503   1.1       jtc 		EMIT(OEOL, 0);
    504   1.1       jtc 		p->g->iflags |= USEEOL;
    505   1.1       jtc 		p->g->neol++;
    506   1.1       jtc 		break;
    507   1.1       jtc 	case '|':
    508   1.1       jtc 		SETERROR(REG_EMPTY);
    509   1.1       jtc 		break;
    510   1.1       jtc 	case '*':
    511   1.1       jtc 	case '+':
    512   1.1       jtc 	case '?':
    513  1.39  christos 	case '{':
    514   1.1       jtc 		SETERROR(REG_BADRPT);
    515   1.1       jtc 		break;
    516   1.1       jtc 	case '.':
    517   1.1       jtc 		if (p->g->cflags&REG_NEWLINE)
    518   1.1       jtc 			nonnewline(p);
    519   1.1       jtc 		else
    520   1.1       jtc 			EMIT(OANY, 0);
    521   1.1       jtc 		break;
    522   1.1       jtc 	case '[':
    523   1.1       jtc 		p_bracket(p);
    524   1.1       jtc 		break;
    525   1.1       jtc 	case '\\':
    526  1.39  christos 		(void)REQUIRE(MORE(), REG_EESCAPE);
    527  1.39  christos 		wc = WGETNEXT();
    528  1.39  christos #ifdef REGEX_GNU_EXTENSIONS
    529  1.39  christos 		if (p->gnuext) {
    530  1.39  christos 			handled = 1;
    531  1.39  christos 			switch (wc) {
    532  1.39  christos 			case '`':
    533  1.39  christos 				EMIT(OBOS, 0);
    534  1.39  christos 				break;
    535  1.39  christos 			case '\'':
    536  1.39  christos 				EMIT(OEOS, 0);
    537  1.39  christos 				break;
    538  1.39  christos 			case 'B':
    539  1.39  christos 				EMIT(ONWBND, 0);
    540  1.39  christos 				break;
    541  1.39  christos 			case 'b':
    542  1.39  christos 				EMIT(OWBND, 0);
    543  1.39  christos 				break;
    544  1.39  christos 			case 'W':
    545  1.39  christos 			case 'w':
    546  1.39  christos 			case 'S':
    547  1.39  christos 			case 's':
    548  1.39  christos 				p_b_pseudoclass(p, wc);
    549  1.39  christos 				break;
    550  1.39  christos 			case '1':
    551  1.39  christos 			case '2':
    552  1.39  christos 			case '3':
    553  1.39  christos 			case '4':
    554  1.39  christos 			case '5':
    555  1.39  christos 			case '6':
    556  1.39  christos 			case '7':
    557  1.39  christos 			case '8':
    558  1.39  christos 			case '9':
    559  1.39  christos 				i = wc - '0';
    560  1.39  christos 				assert(i < NPAREN);
    561  1.39  christos 				if (p->pend[i] != 0) {
    562  1.39  christos 					assert(i <= p->g->nsub);
    563  1.39  christos 					EMIT(OBACK_, i);
    564  1.39  christos 					assert(p->pbegin[i] != 0);
    565  1.39  christos 					assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
    566  1.39  christos 					assert(OP(p->strip[p->pend[i]]) == ORPAREN);
    567  1.39  christos 					(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
    568  1.39  christos 					EMIT(O_BACK, i);
    569  1.39  christos 				} else
    570  1.39  christos 					SETERROR(REG_ESUBREG);
    571  1.39  christos 				p->g->backrefs = 1;
    572  1.39  christos 				break;
    573  1.39  christos 			default:
    574  1.39  christos 				handled = 0;
    575  1.39  christos 			}
    576  1.39  christos 			/* Don't proceed to the POSIX bits if we've already handled it */
    577  1.39  christos 			if (handled)
    578  1.39  christos 				break;
    579  1.39  christos 		}
    580  1.39  christos #endif
    581  1.39  christos 		switch (wc) {
    582  1.39  christos 		case '<':
    583  1.39  christos 			EMIT(OBOW, 0);
    584  1.39  christos 			break;
    585  1.39  christos 		case '>':
    586  1.39  christos 			EMIT(OEOW, 0);
    587  1.39  christos 			break;
    588  1.39  christos 		default:
    589  1.39  christos 			if (may_escape(p, wc))
    590  1.39  christos 				ordinary(p, wc);
    591  1.39  christos 			else
    592  1.39  christos 				SETERROR(REG_EESCAPE);
    593  1.39  christos 			break;
    594  1.39  christos 		}
    595   1.1       jtc 		break;
    596   1.1       jtc 	default:
    597  1.38  christos 		if (p->error != 0)
    598  1.39  christos 			return (false);
    599  1.39  christos 		p->next--;
    600  1.39  christos 		wc = WGETNEXT();
    601  1.39  christos 		ordinary(p, wc);
    602   1.1       jtc 		break;
    603   1.1       jtc 	}
    604   1.1       jtc 
    605   1.1       jtc 	if (!MORE())
    606  1.39  christos 		return (false);
    607   1.1       jtc 	c = PEEK();
    608   1.1       jtc 	/* we call { a repetition if followed by a digit */
    609  1.39  christos 	if (!( c == '*' || c == '+' || c == '?' || c == '{'))
    610  1.39  christos 		return (false);		/* no repetition, we're done */
    611  1.39  christos 	else if (c == '{')
    612  1.39  christos 		(void)REQUIRE(MORE2() && \
    613  1.39  christos 		    (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
    614   1.1       jtc 	NEXT();
    615   1.1       jtc 
    616  1.39  christos 	(void)REQUIRE(!wascaret, REG_BADRPT);
    617   1.1       jtc 	switch (c) {
    618   1.1       jtc 	case '*':	/* implemented as +? */
    619   1.4       jtc 		/* this case does not require the (y|) trick, noKLUDGE */
    620   1.1       jtc 		INSERT(OPLUS_, pos);
    621   1.1       jtc 		ASTERN(O_PLUS, pos);
    622   1.1       jtc 		INSERT(OQUEST_, pos);
    623   1.1       jtc 		ASTERN(O_QUEST, pos);
    624   1.1       jtc 		break;
    625   1.1       jtc 	case '+':
    626   1.1       jtc 		INSERT(OPLUS_, pos);
    627   1.1       jtc 		ASTERN(O_PLUS, pos);
    628   1.1       jtc 		break;
    629   1.1       jtc 	case '?':
    630   1.4       jtc 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
    631   1.4       jtc 		INSERT(OCH_, pos);		/* offset slightly wrong */
    632   1.4       jtc 		ASTERN(OOR1, pos);		/* this one's right */
    633   1.4       jtc 		AHEAD(pos);			/* fix the OCH_ */
    634   1.4       jtc 		EMIT(OOR2, 0);			/* offset very wrong... */
    635   1.4       jtc 		AHEAD(THERE());			/* ...so fix it */
    636   1.4       jtc 		ASTERN(O_CH, THERETHERE());
    637   1.1       jtc 		break;
    638   1.1       jtc 	case '{':
    639   1.1       jtc 		count = p_count(p);
    640   1.1       jtc 		if (EAT(',')) {
    641  1.39  christos 			if (isdigit((uch)PEEK())) {
    642   1.1       jtc 				count2 = p_count(p);
    643  1.39  christos 				(void)REQUIRE(count <= count2, REG_BADBR);
    644   1.1       jtc 			} else		/* single number with comma */
    645   1.1       jtc 				count2 = INFINITY;
    646   1.1       jtc 		} else		/* just a single number */
    647   1.1       jtc 			count2 = count;
    648  1.39  christos 		repeat(p, pos, count, count2);
    649   1.1       jtc 		if (!EAT('}')) {	/* error heuristics */
    650   1.1       jtc 			while (MORE() && PEEK() != '}')
    651   1.1       jtc 				NEXT();
    652  1.39  christos 			(void)REQUIRE(MORE(), REG_EBRACE);
    653   1.1       jtc 			SETERROR(REG_BADBR);
    654   1.1       jtc 		}
    655   1.1       jtc 		break;
    656   1.1       jtc 	}
    657   1.1       jtc 
    658   1.1       jtc 	if (!MORE())
    659  1.39  christos 		return (false);
    660   1.1       jtc 	c = PEEK();
    661   1.1       jtc 	if (!( c == '*' || c == '+' || c == '?' ||
    662  1.39  christos 				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
    663  1.39  christos 		return (false);
    664   1.1       jtc 	SETERROR(REG_BADRPT);
    665  1.39  christos 	return (false);
    666   1.1       jtc }
    667   1.1       jtc 
    668   1.1       jtc /*
    669   1.1       jtc  - p_str - string (no metacharacters) "parser"
    670   1.9     perry  == static void p_str(struct parse *p);
    671   1.1       jtc  */
    672   1.1       jtc static void
    673  1.39  christos p_str(struct parse *p)
    674   1.1       jtc {
    675  1.39  christos 	(void)REQUIRE(MORE(), REG_EMPTY);
    676  1.39  christos 	while (MORE())
    677  1.39  christos 		ordinary(p, WGETNEXT());
    678  1.39  christos }
    679  1.14     lukem 
    680  1.39  christos /*
    681  1.39  christos  * Eat consecutive branch delimiters for the kind of expression that we are
    682  1.39  christos  * parsing, return the number of delimiters that we ate.
    683  1.39  christos  */
    684  1.39  christos static int
    685  1.39  christos p_branch_eat_delim(struct parse *p, struct branchc *bc)
    686  1.39  christos {
    687  1.39  christos 	int nskip;
    688  1.14     lukem 
    689  1.39  christos 	(void)bc;
    690  1.39  christos 	nskip = 0;
    691  1.39  christos 	while (EATSPEC('|'))
    692  1.39  christos 		++nskip;
    693  1.39  christos 	return (nskip);
    694   1.1       jtc }
    695   1.1       jtc 
    696   1.1       jtc /*
    697  1.39  christos  * Insert necessary branch book-keeping operations. This emits a
    698  1.39  christos  * bogus 'next' offset, since we still have more to parse
    699   1.1       jtc  */
    700   1.1       jtc static void
    701  1.39  christos p_branch_ins_offset(struct parse *p, struct branchc *bc)
    702   1.9     perry {
    703   1.1       jtc 
    704  1.39  christos 	if (bc->nbranch == 0) {
    705  1.39  christos 		INSERT(OCH_, bc->start);	/* offset is wrong */
    706  1.39  christos 		bc->fwd = bc->start;
    707  1.39  christos 		bc->back = bc->start;
    708  1.39  christos 	}
    709  1.39  christos 
    710  1.39  christos 	ASTERN(OOR1, bc->back);
    711  1.39  christos 	bc->back = THERE();
    712  1.39  christos 	AHEAD(bc->fwd);			/* fix previous offset */
    713  1.39  christos 	bc->fwd = HERE();
    714  1.39  christos 	EMIT(OOR2, 0);			/* offset is very wrong */
    715  1.39  christos 	++bc->nbranch;
    716  1.39  christos }
    717  1.39  christos 
    718  1.39  christos /*
    719  1.39  christos  * Fix the offset of the tail branch, if we actually had any branches.
    720  1.39  christos  * This is to correct the bogus placeholder offset that we use.
    721  1.39  christos  */
    722  1.39  christos static void
    723  1.39  christos p_branch_fix_tail(struct parse *p, struct branchc *bc)
    724  1.39  christos {
    725  1.14     lukem 
    726  1.39  christos 	/* Fix bogus offset at the tail if we actually have branches */
    727  1.39  christos 	if (bc->nbranch > 0) {
    728  1.39  christos 		AHEAD(bc->fwd);
    729  1.39  christos 		ASTERN(O_CH, bc->back);
    730  1.30  christos 	}
    731  1.39  christos }
    732  1.39  christos 
    733  1.39  christos /*
    734  1.39  christos  * Signal to the parser that an empty branch has been encountered; this will,
    735  1.39  christos  * in the future, be used to allow for more permissive behavior with empty
    736  1.39  christos  * branches. The return value should indicate whether parsing may continue
    737  1.39  christos  * or not.
    738  1.39  christos  */
    739  1.39  christos static bool
    740  1.39  christos p_branch_empty(struct parse *p, struct branchc *bc)
    741  1.39  christos {
    742  1.39  christos 
    743  1.39  christos 	(void)bc;
    744  1.39  christos 	SETERROR(REG_EMPTY);
    745  1.39  christos 	return (false);
    746  1.39  christos }
    747  1.39  christos 
    748  1.39  christos /*
    749  1.39  christos  * Take care of any branching requirements. This includes inserting the
    750  1.39  christos  * appropriate branching instructions as well as eating all of the branch
    751  1.39  christos  * delimiters until we either run out of pattern or need to parse more pattern.
    752  1.39  christos  */
    753  1.39  christos static bool
    754  1.39  christos p_branch_do(struct parse *p, struct branchc *bc)
    755  1.39  christos {
    756  1.39  christos 	int ate = 0;
    757  1.39  christos 
    758  1.39  christos 	ate = p_branch_eat_delim(p, bc);
    759  1.39  christos 	if (ate == 0)
    760  1.39  christos 		return (false);
    761  1.39  christos 	else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
    762  1.39  christos 		/*
    763  1.39  christos 		 * Halt parsing only if we have an empty branch and p_branch_empty
    764  1.39  christos 		 * indicates that we must not continue. In the future, this will not
    765  1.39  christos 		 * necessarily be an error.
    766  1.39  christos 		 */
    767  1.39  christos 		return (false);
    768  1.39  christos 	p_branch_ins_offset(p, bc);
    769  1.39  christos 
    770  1.39  christos 	return (true);
    771  1.39  christos }
    772  1.30  christos 
    773  1.39  christos static void
    774  1.39  christos p_bre_pre_parse(struct parse *p, struct branchc *bc)
    775  1.39  christos {
    776  1.14     lukem 
    777  1.40  christos 	(void)bc;
    778  1.39  christos 	/*
    779  1.39  christos 	 * Does not move cleanly into expression parser because of
    780  1.39  christos 	 * ordinary interpration of * at the beginning position of
    781  1.39  christos 	 * an expression.
    782  1.39  christos 	 */
    783   1.1       jtc 	if (EAT('^')) {
    784   1.1       jtc 		EMIT(OBOL, 0);
    785   1.1       jtc 		p->g->iflags |= USEBOL;
    786   1.1       jtc 		p->g->nbol++;
    787   1.1       jtc 	}
    788  1.39  christos }
    789  1.39  christos 
    790  1.39  christos static void
    791  1.39  christos p_bre_post_parse(struct parse *p, struct branchc *bc)
    792  1.39  christos {
    793  1.39  christos 
    794  1.39  christos 	/* Expression is terminating due to EOL token */
    795  1.39  christos 	if (bc->terminate) {
    796   1.1       jtc 		DROP(1);
    797   1.1       jtc 		EMIT(OEOL, 0);
    798   1.1       jtc 		p->g->iflags |= USEEOL;
    799   1.1       jtc 		p->g->neol++;
    800   1.1       jtc 	}
    801  1.39  christos }
    802   1.1       jtc 
    803  1.39  christos /*
    804  1.39  christos  - p_re - Top level parser, concatenation and BRE anchoring
    805  1.39  christos  == static void p_re(struct parse *p, int end1, int end2);
    806  1.39  christos  * Giving end1 as OUT essentially eliminates the end1/end2 check.
    807  1.39  christos  *
    808  1.39  christos  * This implementation is a bit of a kludge, in that a trailing $ is first
    809  1.39  christos  * taken as an ordinary character and then revised to be an anchor.
    810  1.39  christos  * The amount of lookahead needed to avoid this kludge is excessive.
    811  1.39  christos  */
    812  1.39  christos static void
    813  1.39  christos p_re(struct parse *p,
    814  1.39  christos 	int end1,	/* first terminating character */
    815  1.39  christos 	int end2)	/* second terminating character; ignored for EREs */
    816  1.39  christos {
    817  1.39  christos 	struct branchc bc;
    818  1.39  christos 
    819  1.39  christos 	bc.nbranch = 0;
    820  1.39  christos 	if (end1 == OUT && end2 == OUT)
    821  1.39  christos 		bc.outer = true;
    822  1.39  christos 	else
    823  1.39  christos 		bc.outer = false;
    824  1.39  christos #define	SEEEND()	(!p->bre ? SEE(end1) : SEETWO(end1, end2))
    825  1.39  christos 	for (;;) {
    826  1.39  christos 		bc.start = HERE();
    827  1.39  christos 		bc.nchain = 0;
    828  1.39  christos 		bc.terminate = false;
    829  1.39  christos 		if (p->pre_parse != NULL)
    830  1.39  christos 			p->pre_parse(p, &bc);
    831  1.39  christos 		while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
    832  1.39  christos 			bc.terminate = p->parse_expr(p, &bc);
    833  1.39  christos 			++bc.nchain;
    834  1.39  christos 		}
    835  1.39  christos 		if (p->post_parse != NULL)
    836  1.39  christos 			p->post_parse(p, &bc);
    837  1.39  christos 		(void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY);
    838  1.39  christos #ifdef REGEX_GNU_EXTENSIONS
    839  1.39  christos 		if (p->gnuext && HERE() == bc.start && !p_branch_empty(p, &bc))
    840  1.39  christos 			break;
    841  1.39  christos #endif
    842  1.39  christos 		if (!p->allowbranch)
    843  1.39  christos 			break;
    844  1.39  christos 		/*
    845  1.39  christos 		 * p_branch_do's return value indicates whether we should
    846  1.39  christos 		 * continue parsing or not. This is both for correctness and
    847  1.39  christos 		 * a slight optimization, because it will check if we've
    848  1.39  christos 		 * encountered an empty branch or the end of the string
    849  1.39  christos 		 * immediately following a branch delimiter.
    850  1.39  christos 		 */
    851  1.39  christos 		if (!p_branch_do(p, &bc))
    852  1.39  christos 			break;
    853  1.39  christos 	}
    854  1.39  christos #undef SEE_END
    855  1.39  christos 	if (p->allowbranch)
    856  1.39  christos 		p_branch_fix_tail(p, &bc);
    857  1.39  christos 	assert(!MORE() || SEE(end1));
    858   1.1       jtc }
    859   1.1       jtc 
    860   1.1       jtc /*
    861   1.1       jtc  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
    862  1.39  christos  == static bool p_simp_re(struct parse *p, struct branchc *bc);
    863   1.1       jtc  */
    864  1.39  christos static bool			/* was the simple RE an unbackslashed $? */
    865  1.39  christos p_simp_re(struct parse *p, struct branchc *bc)
    866   1.1       jtc {
    867   1.9     perry 	int c;
    868  1.39  christos 	int cc;			/* convenient/control character */
    869   1.9     perry 	int count;
    870   1.9     perry 	int count2;
    871  1.39  christos 	sopno pos;
    872  1.39  christos 	bool handled;
    873  1.40  christos 	size_t i;
    874  1.39  christos 	wint_t wc;
    875   1.9     perry 	sopno subno;
    876   1.1       jtc #	define	BACKSL	(1<<CHAR_BIT)
    877   1.1       jtc 
    878  1.39  christos 	pos = HERE();		/* repetition op, if any, covers from here */
    879  1.39  christos 	handled = false;
    880   1.1       jtc 
    881   1.1       jtc 	assert(MORE());		/* caller should have ensured this */
    882   1.1       jtc 	c = GETNEXT();
    883   1.1       jtc 	if (c == '\\') {
    884  1.39  christos 		(void)REQUIRE(MORE(), REG_EESCAPE);
    885  1.39  christos 		cc = GETNEXT();
    886  1.39  christos 		c = BACKSL | cc;
    887  1.39  christos #ifdef REGEX_GNU_EXTENSIONS
    888  1.39  christos 		if (p->gnuext) {
    889  1.39  christos 			handled = true;
    890  1.39  christos 			switch (c) {
    891  1.39  christos 			case BACKSL|'`':
    892  1.39  christos 				EMIT(OBOS, 0);
    893  1.39  christos 				break;
    894  1.39  christos 			case BACKSL|'\'':
    895  1.39  christos 				EMIT(OEOS, 0);
    896  1.39  christos 				break;
    897  1.39  christos 			case BACKSL|'B':
    898  1.39  christos 				EMIT(ONWBND, 0);
    899  1.39  christos 				break;
    900  1.39  christos 			case BACKSL|'b':
    901  1.39  christos 				EMIT(OWBND, 0);
    902  1.39  christos 				break;
    903  1.39  christos 			case BACKSL|'W':
    904  1.39  christos 			case BACKSL|'w':
    905  1.39  christos 			case BACKSL|'S':
    906  1.39  christos 			case BACKSL|'s':
    907  1.39  christos 				p_b_pseudoclass(p, cc);
    908  1.39  christos 				break;
    909  1.39  christos 			default:
    910  1.39  christos 				handled = false;
    911  1.39  christos 			}
    912  1.39  christos 		}
    913  1.39  christos #endif
    914   1.1       jtc 	}
    915  1.39  christos 	if (!handled) {
    916  1.39  christos 		switch (c) {
    917  1.39  christos 		case '.':
    918  1.39  christos 			if (p->g->cflags&REG_NEWLINE)
    919  1.39  christos 				nonnewline(p);
    920  1.39  christos 			else
    921  1.39  christos 				EMIT(OANY, 0);
    922  1.39  christos 			break;
    923  1.39  christos 		case '[':
    924  1.39  christos 			p_bracket(p);
    925  1.39  christos 			break;
    926  1.39  christos 		case BACKSL|'<':
    927  1.39  christos 			EMIT(OBOW, 0);
    928  1.39  christos 			break;
    929  1.39  christos 		case BACKSL|'>':
    930  1.39  christos 			EMIT(OEOW, 0);
    931  1.39  christos 			break;
    932  1.39  christos 		case BACKSL|'{':
    933  1.39  christos 			SETERROR(REG_BADRPT);
    934  1.39  christos 			break;
    935  1.39  christos 		case BACKSL|'(':
    936  1.39  christos 			p->g->nsub++;
    937  1.40  christos 			subno = (sopno)p->g->nsub;
    938  1.39  christos 			if (subno < NPAREN)
    939  1.39  christos 				p->pbegin[subno] = HERE();
    940  1.39  christos 			EMIT(OLPAREN, subno);
    941  1.39  christos 			/* the MORE here is an error heuristic */
    942  1.39  christos 			if (MORE() && !SEETWO('\\', ')'))
    943  1.39  christos 				p_re(p, '\\', ')');
    944  1.39  christos 			if (subno < NPAREN) {
    945  1.39  christos 				p->pend[subno] = HERE();
    946  1.39  christos 				assert(p->pend[subno] != 0);
    947  1.39  christos 			}
    948  1.39  christos 			EMIT(ORPAREN, subno);
    949  1.39  christos 			(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
    950  1.39  christos 			break;
    951  1.39  christos 		case BACKSL|')':	/* should not get here -- must be user */
    952  1.39  christos 			SETERROR(REG_EPAREN);
    953  1.39  christos 			break;
    954  1.39  christos 		case BACKSL|'1':
    955  1.39  christos 		case BACKSL|'2':
    956  1.39  christos 		case BACKSL|'3':
    957  1.39  christos 		case BACKSL|'4':
    958  1.39  christos 		case BACKSL|'5':
    959  1.39  christos 		case BACKSL|'6':
    960  1.39  christos 		case BACKSL|'7':
    961  1.39  christos 		case BACKSL|'8':
    962  1.39  christos 		case BACKSL|'9':
    963  1.39  christos 			i = (c&~BACKSL) - '0';
    964  1.39  christos 			assert(i < NPAREN);
    965  1.39  christos 			if (p->pend[i] != 0) {
    966  1.39  christos 				assert(i <= p->g->nsub);
    967  1.39  christos 				EMIT(OBACK_, i);
    968  1.39  christos 				assert(p->pbegin[i] != 0);
    969  1.39  christos 				assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
    970  1.39  christos 				assert(OP(p->strip[p->pend[i]]) == ORPAREN);
    971  1.39  christos 				(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
    972  1.39  christos 				EMIT(O_BACK, i);
    973  1.39  christos 			} else
    974  1.39  christos 				SETERROR(REG_ESUBREG);
    975  1.39  christos 			p->g->backrefs = 1;
    976  1.39  christos 			break;
    977  1.39  christos 		case '*':
    978  1.39  christos 			/*
    979  1.39  christos 			 * Ordinary if used as the first character beyond BOL anchor of
    980  1.39  christos 			 * a (sub-)expression, counts as a bad repetition operator if it
    981  1.39  christos 			 * appears otherwise.
    982  1.39  christos 			 */
    983  1.39  christos 			(void)REQUIRE(bc->nchain == 0, REG_BADRPT);
    984  1.39  christos 			/* FALLTHROUGH */
    985  1.39  christos 		default:
    986  1.39  christos 			if (p->error != 0)
    987  1.39  christos 				return (false);	/* Definitely not $... */
    988  1.39  christos 			p->next--;
    989  1.39  christos 			wc = WGETNEXT();
    990  1.39  christos 			if ((c & BACKSL) == 0 || may_escape(p, wc))
    991  1.39  christos 				ordinary(p, wc);
    992  1.39  christos 			else
    993  1.39  christos 				SETERROR(REG_EESCAPE);
    994  1.39  christos 			break;
    995   1.1       jtc 		}
    996   1.1       jtc 	}
    997   1.1       jtc 
    998   1.1       jtc 	if (EAT('*')) {		/* implemented as +? */
    999   1.4       jtc 		/* this case does not require the (y|) trick, noKLUDGE */
   1000   1.1       jtc 		INSERT(OPLUS_, pos);
   1001   1.1       jtc 		ASTERN(O_PLUS, pos);
   1002   1.1       jtc 		INSERT(OQUEST_, pos);
   1003   1.1       jtc 		ASTERN(O_QUEST, pos);
   1004  1.39  christos #ifdef REGEX_GNU_EXTENSIONS
   1005  1.39  christos 	} else if (p->gnuext && EATTWO('\\', '?')) {
   1006  1.39  christos 		INSERT(OQUEST_, pos);
   1007  1.39  christos 		ASTERN(O_QUEST, pos);
   1008  1.39  christos 	} else if (p->gnuext && EATTWO('\\', '+')) {
   1009  1.39  christos 		INSERT(OPLUS_, pos);
   1010  1.39  christos 		ASTERN(O_PLUS, pos);
   1011  1.39  christos #endif
   1012   1.1       jtc 	} else if (EATTWO('\\', '{')) {
   1013   1.1       jtc 		count = p_count(p);
   1014   1.1       jtc 		if (EAT(',')) {
   1015  1.39  christos 			if (MORE() && isdigit((uch)PEEK())) {
   1016   1.1       jtc 				count2 = p_count(p);
   1017  1.39  christos 				(void)REQUIRE(count <= count2, REG_BADBR);
   1018   1.1       jtc 			} else		/* single number with comma */
   1019   1.1       jtc 				count2 = INFINITY;
   1020   1.1       jtc 		} else		/* just a single number */
   1021   1.1       jtc 			count2 = count;
   1022  1.39  christos 		repeat(p, pos, count, count2);
   1023   1.1       jtc 		if (!EATTWO('\\', '}')) {	/* error heuristics */
   1024   1.1       jtc 			while (MORE() && !SEETWO('\\', '}'))
   1025   1.1       jtc 				NEXT();
   1026  1.39  christos 			(void)REQUIRE(MORE(), REG_EBRACE);
   1027   1.1       jtc 			SETERROR(REG_BADBR);
   1028   1.1       jtc 		}
   1029  1.39  christos 	} else if (c == '$')     /* $ (but not \$) ends it */
   1030  1.39  christos 		return (true);
   1031   1.1       jtc 
   1032  1.39  christos 	return (false);
   1033   1.1       jtc }
   1034   1.1       jtc 
   1035   1.1       jtc /*
   1036   1.1       jtc  - p_count - parse a repetition count
   1037   1.9     perry  == static int p_count(struct parse *p);
   1038   1.1       jtc  */
   1039   1.1       jtc static int			/* the value */
   1040  1.39  christos p_count(struct parse *p)
   1041   1.1       jtc {
   1042   1.9     perry 	int count = 0;
   1043   1.9     perry 	int ndigits = 0;
   1044   1.1       jtc 
   1045  1.39  christos 	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
   1046   1.1       jtc 		count = count*10 + (GETNEXT() - '0');
   1047   1.1       jtc 		ndigits++;
   1048   1.1       jtc 	}
   1049   1.1       jtc 
   1050  1.39  christos 	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
   1051   1.1       jtc 	return(count);
   1052   1.1       jtc }
   1053   1.1       jtc 
   1054   1.1       jtc /*
   1055   1.1       jtc  - p_bracket - parse a bracketed character list
   1056   1.9     perry  == static void p_bracket(struct parse *p);
   1057   1.1       jtc  */
   1058   1.1       jtc static void
   1059  1.39  christos p_bracket(struct parse *p)
   1060   1.1       jtc {
   1061  1.14     lukem 	cset *cs;
   1062  1.39  christos 	wint_t ch;
   1063  1.14     lukem 
   1064   1.1       jtc 	/* Dept of Truly Sickening Special-Case Kludges */
   1065  1.39  christos 	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
   1066   1.1       jtc 		EMIT(OBOW, 0);
   1067   1.1       jtc 		NEXTn(6);
   1068   1.1       jtc 		return;
   1069   1.1       jtc 	}
   1070  1.39  christos 	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
   1071   1.1       jtc 		EMIT(OEOW, 0);
   1072   1.1       jtc 		NEXTn(6);
   1073   1.1       jtc 		return;
   1074   1.1       jtc 	}
   1075   1.1       jtc 
   1076  1.39  christos 	if ((cs = allocset(p)) == NULL)
   1077  1.39  christos 		return;
   1078  1.39  christos 
   1079  1.39  christos 	if (p->g->cflags&REG_ICASE)
   1080  1.39  christos 		cs->icase = 1;
   1081   1.1       jtc 	if (EAT('^'))
   1082  1.39  christos 		cs->invert = 1;
   1083   1.1       jtc 	if (EAT(']'))
   1084  1.39  christos 		CHadd(p, cs, ']');
   1085   1.1       jtc 	else if (EAT('-'))
   1086  1.39  christos 		CHadd(p, cs, '-');
   1087   1.1       jtc 	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
   1088   1.1       jtc 		p_b_term(p, cs);
   1089   1.1       jtc 	if (EAT('-'))
   1090  1.39  christos 		CHadd(p, cs, '-');
   1091  1.39  christos 	(void)MUSTEAT(']', REG_EBRACK);
   1092   1.1       jtc 
   1093   1.1       jtc 	if (p->error != 0)	/* don't mess things up further */
   1094   1.1       jtc 		return;
   1095   1.1       jtc 
   1096  1.39  christos 	if (cs->invert && p->g->cflags&REG_NEWLINE)
   1097  1.39  christos 		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
   1098   1.1       jtc 
   1099  1.39  christos 	if ((ch = singleton(cs)) != OUT) {	/* optimize singleton sets */
   1100  1.39  christos 		ordinary(p, ch);
   1101  1.39  christos 		freeset(p, cs);
   1102  1.39  christos 	} else
   1103  1.40  christos 		EMIT(OANYOF, (size_t)(cs - p->g->sets));
   1104  1.39  christos }
   1105   1.1       jtc 
   1106  1.39  christos static int
   1107  1.39  christos p_range_cmp(wchar_t c1, wchar_t c2)
   1108  1.39  christos {
   1109  1.39  christos #ifdef REGEX_LIBC_COLLATE
   1110  1.39  christos 	return __wcollate_range_cmp(c1, c2);
   1111  1.42  christos #elif defined(NLS)
   1112  1.39  christos 	/* Copied from libc/collate __wcollate_range_cmp */
   1113  1.39  christos 	wchar_t s1[2], s2[2];
   1114   1.1       jtc 
   1115  1.39  christos 	s1[0] = c1;
   1116  1.39  christos 	s1[1] = L'\0';
   1117  1.39  christos 	s2[0] = c2;
   1118  1.39  christos 	s2[1] = L'\0';
   1119  1.42  christos 	return wcscoll(s1, s2);
   1120  1.42  christos #else
   1121  1.42  christos 	char s1[2], s2[2];
   1122  1.42  christos 
   1123  1.42  christos 	s1[0] = (char)c1;
   1124  1.42  christos 	s1[1] = '\0';
   1125  1.42  christos 	s2[0] = (char)c2;
   1126  1.42  christos 	s2[1] = '\0';
   1127  1.42  christos 	return strcoll(s1, s2);
   1128  1.39  christos #endif
   1129   1.1       jtc }
   1130   1.1       jtc 
   1131   1.1       jtc /*
   1132   1.1       jtc  - p_b_term - parse one term of a bracketed character list
   1133   1.9     perry  == static void p_b_term(struct parse *p, cset *cs);
   1134   1.1       jtc  */
   1135   1.1       jtc static void
   1136  1.39  christos p_b_term(struct parse *p, cset *cs)
   1137   1.1       jtc {
   1138   1.9     perry 	char c;
   1139  1.39  christos 	wint_t start, finish;
   1140  1.39  christos 	wint_t i;
   1141  1.39  christos #ifdef REGEX_LIBC_COLLATE
   1142  1.39  christos 	struct xlocale_collate *table =
   1143  1.39  christos 		(struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
   1144  1.39  christos #endif
   1145   1.1       jtc 
   1146  1.14     lukem 	_DIAGASSERT(p != NULL);
   1147  1.14     lukem 	_DIAGASSERT(cs != NULL);
   1148  1.14     lukem 
   1149   1.1       jtc 	/* classify what we've got */
   1150   1.1       jtc 	switch ((MORE()) ? PEEK() : '\0') {
   1151   1.1       jtc 	case '[':
   1152   1.1       jtc 		c = (MORE2()) ? PEEK2() : '\0';
   1153   1.1       jtc 		break;
   1154   1.1       jtc 	case '-':
   1155   1.1       jtc 		SETERROR(REG_ERANGE);
   1156   1.1       jtc 		return;			/* NOTE RETURN */
   1157   1.1       jtc 	default:
   1158   1.1       jtc 		c = '\0';
   1159   1.1       jtc 		break;
   1160   1.1       jtc 	}
   1161   1.1       jtc 
   1162   1.1       jtc 	switch (c) {
   1163   1.1       jtc 	case ':':		/* character class */
   1164   1.1       jtc 		NEXT2();
   1165  1.39  christos 		(void)REQUIRE(MORE(), REG_EBRACK);
   1166   1.1       jtc 		c = PEEK();
   1167  1.39  christos 		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
   1168   1.1       jtc 		p_b_cclass(p, cs);
   1169  1.39  christos 		(void)REQUIRE(MORE(), REG_EBRACK);
   1170  1.39  christos 		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
   1171   1.1       jtc 		break;
   1172   1.1       jtc 	case '=':		/* equivalence class */
   1173   1.1       jtc 		NEXT2();
   1174  1.39  christos 		(void)REQUIRE(MORE(), REG_EBRACK);
   1175   1.1       jtc 		c = PEEK();
   1176  1.39  christos 		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
   1177   1.1       jtc 		p_b_eclass(p, cs);
   1178  1.39  christos 		(void)REQUIRE(MORE(), REG_EBRACK);
   1179  1.39  christos 		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
   1180   1.1       jtc 		break;
   1181   1.1       jtc 	default:		/* symbol, ordinary character, or range */
   1182   1.1       jtc 		start = p_b_symbol(p);
   1183   1.1       jtc 		if (SEE('-') && MORE2() && PEEK2() != ']') {
   1184   1.1       jtc 			/* range */
   1185   1.1       jtc 			NEXT();
   1186   1.1       jtc 			if (EAT('-'))
   1187   1.1       jtc 				finish = '-';
   1188   1.1       jtc 			else
   1189   1.1       jtc 				finish = p_b_symbol(p);
   1190   1.1       jtc 		} else
   1191   1.1       jtc 			finish = start;
   1192  1.39  christos 		if (start == finish)
   1193  1.39  christos 			CHadd(p, cs, start);
   1194  1.39  christos 		else {
   1195  1.39  christos #ifdef REGEX_LIBC_COLLATE
   1196  1.39  christos 			if (table->__collate_load_error || MB_CUR_MAX > 1) {
   1197  1.39  christos #else
   1198  1.39  christos 			if (MB_CUR_MAX > 1) {
   1199  1.39  christos #endif
   1200  1.39  christos 				(void)REQUIRE(start <= finish, REG_ERANGE);
   1201  1.39  christos 				CHaddrange(p, cs, start, finish);
   1202  1.39  christos 			} else {
   1203  1.39  christos 				(void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE);
   1204  1.39  christos 				for (i = 0; i <= UCHAR_MAX; i++) {
   1205  1.39  christos 					if (p_range_cmp(start, i) <= 0 &&
   1206  1.39  christos 					    p_range_cmp(i, finish) <= 0 )
   1207  1.39  christos 						CHadd(p, cs, i);
   1208  1.39  christos 				}
   1209  1.39  christos 			}
   1210  1.39  christos 		}
   1211  1.39  christos 		break;
   1212  1.39  christos 	}
   1213  1.39  christos }
   1214  1.39  christos 
   1215  1.39  christos #ifdef REGEX_GNU_EXTENSIONS
   1216  1.39  christos /*
   1217  1.39  christos  - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S)
   1218  1.39  christos  == static int p_b_pseudoclass(struct parse *p, char c)
   1219  1.39  christos  */
   1220  1.39  christos static int
   1221  1.39  christos p_b_pseudoclass(struct parse *p, char c) {
   1222  1.39  christos 	cset *cs;
   1223  1.39  christos 
   1224  1.39  christos 	if ((cs = allocset(p)) == NULL)
   1225  1.39  christos 		return(0);
   1226  1.39  christos 
   1227  1.39  christos 	if (p->g->cflags&REG_ICASE)
   1228  1.39  christos 		cs->icase = 1;
   1229  1.39  christos 
   1230  1.39  christos 	switch (c) {
   1231  1.39  christos 	case 'W':
   1232  1.39  christos 		cs->invert = 1;
   1233  1.39  christos 		/* FALLTHROUGH */
   1234  1.39  christos 	case 'w':
   1235  1.39  christos 		p_b_cclass_named(p, cs, "alnum");
   1236   1.1       jtc 		break;
   1237  1.39  christos 	case 'S':
   1238  1.39  christos 		cs->invert = 1;
   1239  1.39  christos 		/* FALLTHROUGH */
   1240  1.39  christos 	case 's':
   1241  1.39  christos 		p_b_cclass_named(p, cs, "space");
   1242  1.39  christos 		break;
   1243  1.39  christos 	default:
   1244  1.39  christos 		return(0);
   1245   1.1       jtc 	}
   1246  1.39  christos 
   1247  1.40  christos 	EMIT(OANYOF, (size_t)(cs - p->g->sets));
   1248  1.39  christos 	return(1);
   1249   1.1       jtc }
   1250  1.39  christos #endif
   1251   1.1       jtc 
   1252   1.1       jtc /*
   1253   1.1       jtc  - p_b_cclass - parse a character-class name and deal with it
   1254   1.9     perry  == static void p_b_cclass(struct parse *p, cset *cs);
   1255   1.1       jtc  */
   1256   1.1       jtc static void
   1257  1.39  christos p_b_cclass(struct parse *p, cset *cs)
   1258   1.1       jtc {
   1259  1.39  christos 	const char *sp = p->next;
   1260   1.9     perry 	size_t len;
   1261  1.39  christos 	char clname[16];
   1262   1.1       jtc 
   1263  1.39  christos 	while (MORE() && isalpha((uch)PEEK()))
   1264   1.1       jtc 		NEXT();
   1265   1.1       jtc 	len = p->next - sp;
   1266  1.39  christos 	if (len >= sizeof(clname) - 1) {
   1267   1.1       jtc 		SETERROR(REG_ECTYPE);
   1268   1.1       jtc 		return;
   1269   1.1       jtc 	}
   1270  1.39  christos 	memcpy(clname, sp, len);
   1271  1.39  christos 	clname[len] = '\0';
   1272  1.39  christos 
   1273  1.39  christos 	p_b_cclass_named(p, cs, clname);
   1274  1.39  christos }
   1275  1.42  christos 
   1276  1.39  christos /*
   1277  1.39  christos  - p_b_cclass_named - deal with a named character class
   1278  1.39  christos  == static void p_b_cclass_named(struct parse *p, cset *cs, const char []);
   1279  1.39  christos  */
   1280  1.39  christos static void
   1281  1.39  christos p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) {
   1282  1.39  christos 	wctype_t wct;
   1283   1.1       jtc 
   1284  1.39  christos 	if ((wct = wctype(clname)) == 0) {
   1285  1.39  christos 		SETERROR(REG_ECTYPE);
   1286  1.39  christos 		return;
   1287  1.39  christos 	}
   1288  1.39  christos 	CHaddtype(p, cs, wct);
   1289   1.1       jtc }
   1290   1.1       jtc 
   1291   1.1       jtc /*
   1292   1.1       jtc  - p_b_eclass - parse an equivalence-class name and deal with it
   1293   1.9     perry  == static void p_b_eclass(struct parse *p, cset *cs);
   1294   1.1       jtc  *
   1295   1.1       jtc  * This implementation is incomplete. xxx
   1296   1.1       jtc  */
   1297   1.1       jtc static void
   1298  1.39  christos p_b_eclass(struct parse *p, cset *cs)
   1299   1.1       jtc {
   1300  1.39  christos 	wint_t c;
   1301   1.1       jtc 
   1302  1.14     lukem 	_DIAGASSERT(p != NULL);
   1303  1.14     lukem 	_DIAGASSERT(cs != NULL);
   1304  1.14     lukem 
   1305   1.1       jtc 	c = p_b_coll_elem(p, '=');
   1306  1.39  christos 	CHadd(p, cs, c);
   1307   1.1       jtc }
   1308   1.1       jtc 
   1309   1.1       jtc /*
   1310   1.1       jtc  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
   1311  1.39  christos  == static wint_t p_b_symbol(struct parse *p);
   1312   1.1       jtc  */
   1313  1.39  christos static wint_t			/* value of symbol */
   1314  1.39  christos p_b_symbol(struct parse *p)
   1315   1.1       jtc {
   1316  1.39  christos 	wint_t value;
   1317   1.1       jtc 
   1318  1.14     lukem 	_DIAGASSERT(p != NULL);
   1319  1.14     lukem 
   1320  1.39  christos 	(void)REQUIRE(MORE(), REG_EBRACK);
   1321   1.1       jtc 	if (!EATTWO('[', '.'))
   1322  1.39  christos 		return(WGETNEXT());
   1323   1.1       jtc 
   1324   1.1       jtc 	/* collating symbol */
   1325   1.1       jtc 	value = p_b_coll_elem(p, '.');
   1326  1.39  christos 	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
   1327   1.1       jtc 	return(value);
   1328   1.1       jtc }
   1329   1.1       jtc 
   1330   1.1       jtc /*
   1331   1.1       jtc  - p_b_coll_elem - parse a collating-element name and look it up
   1332  1.39  christos  == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
   1333   1.1       jtc  */
   1334  1.39  christos static wint_t			/* value of collating element */
   1335  1.39  christos p_b_coll_elem(struct parse *p,
   1336  1.39  christos 	wint_t endc)		/* name ended by endc,']' */
   1337  1.39  christos {
   1338  1.39  christos 	const char *sp = p->next;
   1339  1.39  christos 	struct cname *cp;
   1340  1.42  christos 	size_t len;
   1341   1.1       jtc 
   1342  1.14     lukem 	_DIAGASSERT(p != NULL);
   1343  1.14     lukem 
   1344   1.1       jtc 	while (MORE() && !SEETWO(endc, ']'))
   1345   1.1       jtc 		NEXT();
   1346   1.1       jtc 	if (!MORE()) {
   1347   1.1       jtc 		SETERROR(REG_EBRACK);
   1348   1.1       jtc 		return(0);
   1349   1.1       jtc 	}
   1350   1.1       jtc 	len = p->next - sp;
   1351   1.1       jtc 	for (cp = cnames; cp->name != NULL; cp++)
   1352  1.37  christos 		if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
   1353   1.1       jtc 			return(cp->code);	/* known name */
   1354  1.42  christos #ifdef NLS
   1355  1.42  christos 	mbstate_t mbs;
   1356  1.42  christos 	wchar_t wc;
   1357  1.42  christos 	size_t clen;
   1358  1.42  christos 
   1359  1.39  christos 	memset(&mbs, 0, sizeof(mbs));
   1360  1.39  christos 	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
   1361  1.39  christos 		return (wc);			/* single character */
   1362  1.39  christos 	else if (clen == (size_t)-1 || clen == (size_t)-2)
   1363  1.39  christos 		SETERROR(REG_ILLSEQ);
   1364  1.39  christos 	else
   1365  1.39  christos 		SETERROR(REG_ECOLLATE);		/* neither */
   1366   1.1       jtc 	return(0);
   1367  1.42  christos #else
   1368  1.42  christos 	if (len == 1)
   1369  1.42  christos 		return *sp;    /* single character */
   1370  1.42  christos 	SETERROR(REG_ECOLLATE);                 /* neither */
   1371  1.42  christos 	return 0;
   1372  1.42  christos #endif
   1373   1.1       jtc }
   1374   1.1       jtc 
   1375   1.1       jtc /*
   1376  1.39  christos  - may_escape - determine whether 'ch' is escape-able in the current context
   1377  1.39  christos  == static int may_escape(struct parse *p, const wint_t ch)
   1378  1.39  christos  */
   1379  1.39  christos static bool
   1380  1.39  christos may_escape(struct parse *p, const wint_t ch)
   1381  1.39  christos {
   1382  1.39  christos 
   1383  1.39  christos 	if ((p->pflags & PFLAG_LEGACY_ESC) != 0)
   1384  1.39  christos 		return (true);
   1385  1.39  christos 	if (isalpha(ch) || ch == '\'' || ch == '`')
   1386  1.39  christos 		return (false);
   1387  1.39  christos 	return (true);
   1388  1.39  christos #ifdef NOTYET
   1389  1.39  christos 	/*
   1390  1.39  christos 	 * Build a whitelist of characters that may be escaped to produce an
   1391  1.39  christos 	 * ordinary in the current context. This assumes that these have not
   1392  1.39  christos 	 * been otherwise interpreted as a special character. Escaping an
   1393  1.39  christos 	 * ordinary character yields undefined results according to
   1394  1.39  christos 	 * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
   1395  1.39  christos 	 * advantage of this and use escaped ordinary characters to provide
   1396  1.39  christos 	 * special meaning, e.g. \b, \B, \w, \W, \s, \S.
   1397  1.39  christos 	 */
   1398  1.39  christos 	switch(ch) {
   1399  1.39  christos 	case '|':
   1400  1.39  christos 	case '+':
   1401  1.39  christos 	case '?':
   1402  1.39  christos 		/* The above characters may not be escaped in BREs */
   1403  1.39  christos 		if (!(p->g->cflags&REG_EXTENDED))
   1404  1.39  christos 			return (false);
   1405  1.39  christos 		/* Fallthrough */
   1406  1.39  christos 	case '(':
   1407  1.39  christos 	case ')':
   1408  1.39  christos 	case '{':
   1409  1.39  christos 	case '}':
   1410  1.39  christos 	case '.':
   1411  1.39  christos 	case '[':
   1412  1.39  christos 	case ']':
   1413  1.39  christos 	case '\\':
   1414  1.39  christos 	case '*':
   1415  1.39  christos 	case '^':
   1416  1.39  christos 	case '$':
   1417  1.39  christos 		return (true);
   1418  1.39  christos 	default:
   1419  1.39  christos 		return (false);
   1420  1.39  christos 	}
   1421  1.39  christos #endif
   1422  1.39  christos }
   1423  1.39  christos 
   1424  1.39  christos /*
   1425   1.1       jtc  - othercase - return the case counterpart of an alphabetic
   1426  1.39  christos  == static wint_t othercase(wint_t ch);
   1427   1.1       jtc  */
   1428  1.39  christos static wint_t			/* if no counterpart, return ch */
   1429  1.39  christos othercase(wint_t ch)
   1430  1.39  christos {
   1431  1.39  christos 	assert(iswalpha(ch));
   1432  1.39  christos 	if (iswupper(ch))
   1433  1.39  christos 		return(towlower(ch));
   1434  1.39  christos 	else if (iswlower(ch))
   1435  1.39  christos 		return(towupper(ch));
   1436   1.1       jtc 	else			/* peculiar, but could happen */
   1437   1.1       jtc 		return(ch);
   1438   1.1       jtc }
   1439   1.1       jtc 
   1440   1.1       jtc /*
   1441   1.1       jtc  - bothcases - emit a dualcase version of a two-case character
   1442  1.39  christos  == static void bothcases(struct parse *p, wint_t ch);
   1443   1.1       jtc  *
   1444   1.1       jtc  * Boy, is this implementation ever a kludge...
   1445   1.1       jtc  */
   1446   1.1       jtc static void
   1447  1.39  christos bothcases(struct parse *p, wint_t ch)
   1448   1.1       jtc {
   1449  1.39  christos 	const char *oldnext = p->next;
   1450  1.39  christos 	const char *oldend = p->end;
   1451  1.39  christos 	char bracket[3 + MB_LEN_MAX];
   1452  1.39  christos 	size_t n;
   1453   1.1       jtc 
   1454  1.14     lukem 	_DIAGASSERT(p != NULL);
   1455  1.14     lukem 
   1456   1.1       jtc 	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
   1457   1.1       jtc 	p->next = bracket;
   1458  1.42  christos #ifdef NLS
   1459  1.42  christos 	mbstate_t mbs;
   1460  1.39  christos 	memset(&mbs, 0, sizeof(mbs));
   1461  1.39  christos 	n = wcrtomb(bracket, ch, &mbs);
   1462  1.39  christos 	assert(n != (size_t)-1);
   1463  1.42  christos #else
   1464  1.42  christos 	n = 0;
   1465  1.42  christos 	bracket[n++] = ch;
   1466  1.42  christos #endif
   1467  1.39  christos 	bracket[n] = ']';
   1468  1.39  christos 	bracket[n + 1] = '\0';
   1469  1.39  christos 	p->end = bracket+n+1;
   1470   1.1       jtc 	p_bracket(p);
   1471  1.39  christos 	assert(p->next == p->end);
   1472   1.1       jtc 	p->next = oldnext;
   1473   1.1       jtc 	p->end = oldend;
   1474   1.1       jtc }
   1475   1.1       jtc 
   1476   1.1       jtc /*
   1477   1.1       jtc  - ordinary - emit an ordinary character
   1478  1.39  christos  == static void ordinary(struct parse *p, wint_t ch);
   1479   1.1       jtc  */
   1480   1.1       jtc static void
   1481  1.39  christos ordinary(struct parse *p, wint_t ch)
   1482   1.1       jtc {
   1483  1.39  christos 	cset *cs;
   1484   1.1       jtc 
   1485  1.14     lukem 	_DIAGASSERT(p != NULL);
   1486  1.14     lukem 
   1487  1.39  christos 	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
   1488  1.39  christos 		bothcases(p, ch);
   1489  1.40  christos 	else if ((wint_t)(ch & OPDMASK) == ch)
   1490  1.40  christos 		EMIT(OCHAR, (size_t)ch);
   1491   1.1       jtc 	else {
   1492  1.39  christos 		/*
   1493  1.39  christos 		 * Kludge: character is too big to fit into an OCHAR operand.
   1494  1.39  christos 		 * Emit a singleton set.
   1495  1.39  christos 		 */
   1496  1.39  christos 		if ((cs = allocset(p)) == NULL)
   1497  1.39  christos 			return;
   1498  1.39  christos 		CHadd(p, cs, ch);
   1499  1.40  christos 		EMIT(OANYOF, (size_t)(cs - p->g->sets));
   1500   1.1       jtc 	}
   1501   1.1       jtc }
   1502   1.1       jtc 
   1503   1.1       jtc /*
   1504   1.1       jtc  - nonnewline - emit REG_NEWLINE version of OANY
   1505   1.9     perry  == static void nonnewline(struct parse *p);
   1506   1.1       jtc  *
   1507   1.1       jtc  * Boy, is this implementation ever a kludge...
   1508   1.1       jtc  */
   1509   1.1       jtc static void
   1510  1.39  christos nonnewline(struct parse *p)
   1511   1.1       jtc {
   1512  1.39  christos 	const char *oldnext = p->next;
   1513  1.39  christos 	const char *oldend = p->end;
   1514   1.1       jtc 	char bracket[4];
   1515   1.1       jtc 
   1516  1.14     lukem 	_DIAGASSERT(p != NULL);
   1517  1.14     lukem 
   1518   1.1       jtc 	p->next = bracket;
   1519   1.1       jtc 	p->end = bracket+3;
   1520   1.1       jtc 	bracket[0] = '^';
   1521   1.1       jtc 	bracket[1] = '\n';
   1522   1.1       jtc 	bracket[2] = ']';
   1523   1.1       jtc 	bracket[3] = '\0';
   1524   1.1       jtc 	p_bracket(p);
   1525   1.1       jtc 	assert(p->next == bracket+3);
   1526   1.1       jtc 	p->next = oldnext;
   1527   1.1       jtc 	p->end = oldend;
   1528   1.1       jtc }
   1529   1.1       jtc 
   1530   1.1       jtc /*
   1531   1.1       jtc  - repeat - generate code for a bounded repetition, recursively if needed
   1532  1.39  christos  == static void repeat(struct parse *p, sopno start, int from, int to);
   1533   1.1       jtc  */
   1534   1.1       jtc static void
   1535  1.39  christos repeat(struct parse *p,
   1536  1.39  christos 	sopno start,		/* operand from here to end of strip */
   1537  1.39  christos 	int from,		/* repeated from this number */
   1538  1.39  christos 	int to)			/* to this number of times (maybe INFINITY) */
   1539   1.1       jtc {
   1540  1.39  christos 	sopno finish = HERE();
   1541   1.1       jtc #	define	N	2
   1542   1.1       jtc #	define	INF	3
   1543   1.1       jtc #	define	REP(f, t)	((f)*8 + (t))
   1544   1.1       jtc #	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
   1545   1.9     perry 	sopno copy;
   1546   1.1       jtc 
   1547  1.14     lukem 	_DIAGASSERT(p != NULL);
   1548  1.14     lukem 
   1549  1.39  christos 	if (p->error != 0)	/* head off possible runaway recursion */
   1550  1.30  christos 		return;
   1551  1.30  christos 
   1552   1.1       jtc 	assert(from <= to);
   1553   1.1       jtc 
   1554   1.1       jtc 	switch (REP(MAP(from), MAP(to))) {
   1555   1.1       jtc 	case REP(0, 0):			/* must be user doing this */
   1556   1.1       jtc 		DROP(finish-start);	/* drop the operand */
   1557   1.1       jtc 		break;
   1558   1.1       jtc 	case REP(0, 1):			/* as x{1,1}? */
   1559   1.1       jtc 	case REP(0, N):			/* as x{1,n}? */
   1560   1.1       jtc 	case REP(0, INF):		/* as x{1,}? */
   1561   1.4       jtc 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
   1562   1.4       jtc 		INSERT(OCH_, start);		/* offset is wrong... */
   1563  1.39  christos 		repeat(p, start+1, 1, to);
   1564   1.4       jtc 		ASTERN(OOR1, start);
   1565   1.1       jtc 		AHEAD(start);			/* ... fix it */
   1566   1.4       jtc 		EMIT(OOR2, 0);
   1567   1.4       jtc 		AHEAD(THERE());
   1568   1.4       jtc 		ASTERN(O_CH, THERETHERE());
   1569   1.1       jtc 		break;
   1570   1.1       jtc 	case REP(1, 1):			/* trivial case */
   1571   1.1       jtc 		/* done */
   1572   1.1       jtc 		break;
   1573   1.1       jtc 	case REP(1, N):			/* as x?x{1,n-1} */
   1574   1.4       jtc 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
   1575   1.4       jtc 		INSERT(OCH_, start);
   1576   1.4       jtc 		ASTERN(OOR1, start);
   1577   1.4       jtc 		AHEAD(start);
   1578   1.4       jtc 		EMIT(OOR2, 0);			/* offset very wrong... */
   1579   1.4       jtc 		AHEAD(THERE());			/* ...so fix it */
   1580   1.4       jtc 		ASTERN(O_CH, THERETHERE());
   1581   1.1       jtc 		copy = dupl(p, start+1, finish+1);
   1582   1.4       jtc 		assert(copy == finish+4);
   1583  1.39  christos 		repeat(p, copy, 1, to-1);
   1584   1.1       jtc 		break;
   1585   1.1       jtc 	case REP(1, INF):		/* as x+ */
   1586   1.1       jtc 		INSERT(OPLUS_, start);
   1587   1.1       jtc 		ASTERN(O_PLUS, start);
   1588   1.1       jtc 		break;
   1589   1.1       jtc 	case REP(N, N):			/* as xx{m-1,n-1} */
   1590   1.1       jtc 		copy = dupl(p, start, finish);
   1591  1.39  christos 		repeat(p, copy, from-1, to-1);
   1592   1.1       jtc 		break;
   1593   1.1       jtc 	case REP(N, INF):		/* as xx{n-1,INF} */
   1594   1.1       jtc 		copy = dupl(p, start, finish);
   1595  1.39  christos 		repeat(p, copy, from-1, to);
   1596   1.1       jtc 		break;
   1597   1.1       jtc 	default:			/* "can't happen" */
   1598   1.1       jtc 		SETERROR(REG_ASSERT);	/* just in case */
   1599   1.1       jtc 		break;
   1600   1.1       jtc 	}
   1601   1.1       jtc }
   1602   1.1       jtc 
   1603   1.1       jtc /*
   1604  1.39  christos  - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
   1605  1.39  christos  - character from the parse struct, signals a REG_ILLSEQ error if the
   1606  1.39  christos  - character can't be converted. Returns the number of bytes consumed.
   1607  1.39  christos  */
   1608  1.39  christos static wint_t
   1609  1.39  christos wgetnext(struct parse *p)
   1610  1.39  christos {
   1611  1.42  christos #ifdef NLS
   1612  1.39  christos 	mbstate_t mbs;
   1613  1.39  christos 	wchar_t wc;
   1614  1.39  christos 	size_t n;
   1615  1.39  christos 
   1616  1.39  christos 	memset(&mbs, 0, sizeof(mbs));
   1617  1.40  christos 	n = mbrtowc(&wc, p->next, (size_t)(p->end - p->next), &mbs);
   1618  1.39  christos 	if (n == (size_t)-1 || n == (size_t)-2) {
   1619  1.39  christos 		SETERROR(REG_ILLSEQ);
   1620  1.39  christos 		return (0);
   1621  1.39  christos 	}
   1622  1.39  christos 	if (n == 0)
   1623  1.39  christos 		n = 1;
   1624  1.39  christos 	p->next += n;
   1625  1.42  christos 	return wc;
   1626  1.42  christos #else
   1627  1.42  christos 	return *p->next++;
   1628  1.42  christos #endif
   1629  1.39  christos }
   1630  1.39  christos 
   1631  1.39  christos /*
   1632   1.1       jtc  - seterr - set an error condition
   1633   1.9     perry  == static int seterr(struct parse *p, int e);
   1634   1.1       jtc  */
   1635   1.1       jtc static int			/* useless but makes type checking happy */
   1636  1.39  christos seterr(struct parse *p, int e)
   1637   1.1       jtc {
   1638  1.14     lukem 
   1639  1.14     lukem 	_DIAGASSERT(p != NULL);
   1640  1.14     lukem 
   1641   1.1       jtc 	if (p->error == 0)	/* keep earliest error condition */
   1642   1.1       jtc 		p->error = e;
   1643   1.1       jtc 	p->next = nuls;		/* try to bring things to a halt */
   1644   1.1       jtc 	p->end = nuls;
   1645   1.1       jtc 	return(0);		/* make the return value well-defined */
   1646   1.1       jtc }
   1647   1.1       jtc 
   1648   1.1       jtc /*
   1649   1.1       jtc  - allocset - allocate a set of characters for []
   1650   1.9     perry  == static cset *allocset(struct parse *p);
   1651   1.1       jtc  */
   1652   1.1       jtc static cset *
   1653  1.39  christos allocset(struct parse *p)
   1654   1.1       jtc {
   1655  1.39  christos 	cset *cs, *ncs;
   1656   1.1       jtc 
   1657  1.14     lukem 	_DIAGASSERT(p != NULL);
   1658  1.14     lukem 
   1659  1.39  christos 	ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs));
   1660  1.39  christos 	if (ncs == NULL) {
   1661  1.39  christos 		SETERROR(REG_ESPACE);
   1662  1.39  christos 		return (NULL);
   1663   1.1       jtc 	}
   1664  1.39  christos 	p->g->sets = ncs;
   1665  1.39  christos 	cs = &p->g->sets[p->g->ncsets++];
   1666  1.39  christos 	memset(cs, 0, sizeof(*cs));
   1667   1.1       jtc 
   1668   1.1       jtc 	return(cs);
   1669   1.1       jtc }
   1670   1.1       jtc 
   1671   1.1       jtc /*
   1672   1.1       jtc  - freeset - free a now-unused set
   1673   1.9     perry  == static void freeset(struct parse *p, cset *cs);
   1674   1.1       jtc  */
   1675   1.1       jtc static void
   1676  1.39  christos freeset(struct parse *p, cset *cs)
   1677   1.1       jtc {
   1678  1.14     lukem 	cset *top;
   1679  1.14     lukem 
   1680  1.14     lukem 	_DIAGASSERT(p != NULL);
   1681  1.14     lukem 	_DIAGASSERT(cs != NULL);
   1682  1.14     lukem 
   1683  1.14     lukem 	top = &p->g->sets[p->g->ncsets];
   1684   1.1       jtc 
   1685  1.39  christos 	free(cs->wides);
   1686  1.39  christos 	free(cs->ranges);
   1687  1.39  christos 	free(cs->types);
   1688  1.39  christos 	memset(cs, 0, sizeof(*cs));
   1689   1.1       jtc 	if (cs == top-1)	/* recover only the easy case */
   1690   1.1       jtc 		p->g->ncsets--;
   1691   1.1       jtc }
   1692   1.1       jtc 
   1693   1.1       jtc /*
   1694  1.39  christos  - singleton - Determine whether a set contains only one character,
   1695  1.39  christos  - returning it if so, otherwise returning OUT.
   1696   1.1       jtc  */
   1697  1.39  christos static wint_t
   1698  1.39  christos singleton(cset *cs)
   1699   1.1       jtc {
   1700  1.39  christos 	wint_t i, s, n;
   1701   1.1       jtc 
   1702  1.39  christos 	for (i = n = 0; i < NC; i++)
   1703  1.39  christos 		if (CHIN(cs, i)) {
   1704   1.1       jtc 			n++;
   1705  1.39  christos 			s = i;
   1706  1.39  christos 		}
   1707  1.39  christos 	if (n == 1)
   1708  1.39  christos 		return (s);
   1709  1.39  christos 	if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
   1710  1.39  christos 	    cs->icase == 0)
   1711  1.39  christos 		return (cs->wides[0]);
   1712  1.39  christos 	/* Don't bother handling the other cases. */
   1713  1.39  christos 	return (OUT);
   1714   1.1       jtc }
   1715   1.1       jtc 
   1716   1.1       jtc /*
   1717  1.39  christos  - CHadd - add character to character set.
   1718   1.1       jtc  */
   1719   1.1       jtc static void
   1720  1.39  christos CHadd(struct parse *p, cset *cs, wint_t ch)
   1721   1.1       jtc {
   1722  1.39  christos 	wint_t nch, *newwides;
   1723  1.14     lukem 
   1724  1.14     lukem 	_DIAGASSERT(p != NULL);
   1725  1.14     lukem 	_DIAGASSERT(cs != NULL);
   1726  1.14     lukem 
   1727  1.39  christos 	assert(ch >= 0);
   1728  1.39  christos 	if (ch < NC)
   1729  1.40  christos 		cs->bmp[(unsigned)ch >> 3] |= 1 << (ch & 7);
   1730  1.39  christos 	else {
   1731  1.39  christos 		newwides = reallocarray(cs->wides, cs->nwides + 1,
   1732  1.39  christos 		    sizeof(*cs->wides));
   1733  1.39  christos 		if (newwides == NULL) {
   1734  1.39  christos 			SETERROR(REG_ESPACE);
   1735  1.39  christos 			return;
   1736  1.39  christos 		}
   1737  1.39  christos 		cs->wides = newwides;
   1738  1.39  christos 		cs->wides[cs->nwides++] = ch;
   1739  1.39  christos 	}
   1740  1.39  christos 	if (cs->icase) {
   1741  1.39  christos 		if ((nch = towlower(ch)) < NC)
   1742  1.40  christos 			cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
   1743  1.39  christos 		if ((nch = towupper(ch)) < NC)
   1744  1.40  christos 			cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
   1745   1.1       jtc 	}
   1746   1.1       jtc }
   1747   1.1       jtc 
   1748   1.1       jtc /*
   1749  1.39  christos  - CHaddrange - add all characters in the range [min,max] to a character set.
   1750   1.1       jtc  */
   1751   1.1       jtc static void
   1752  1.39  christos CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
   1753   1.1       jtc {
   1754  1.39  christos 	crange *newranges;
   1755  1.14     lukem 
   1756  1.39  christos 	_DIAGASSERT(p != NULL);
   1757  1.14     lukem 	_DIAGASSERT(cs != NULL);
   1758  1.14     lukem 
   1759  1.39  christos 	for (; min < NC && min <= max; min++)
   1760  1.39  christos 		CHadd(p, cs, min);
   1761  1.39  christos 	if (min >= max)
   1762  1.39  christos 		return;
   1763  1.39  christos 	newranges = reallocarray(cs->ranges, cs->nranges + 1,
   1764  1.39  christos 	    sizeof(*cs->ranges));
   1765  1.39  christos 	if (newranges == NULL) {
   1766  1.39  christos 		SETERROR(REG_ESPACE);
   1767   1.1       jtc 		return;
   1768   1.1       jtc 	}
   1769  1.39  christos 	cs->ranges = newranges;
   1770  1.39  christos 	cs->ranges[cs->nranges].min = min;
   1771  1.39  christos 	cs->ranges[cs->nranges].max = max;
   1772  1.39  christos 	cs->nranges++;
   1773   1.1       jtc }
   1774   1.1       jtc 
   1775   1.1       jtc /*
   1776  1.39  christos  - CHaddtype - add all characters of a certain type to a character set.
   1777   1.1       jtc  */
   1778   1.1       jtc static void
   1779  1.39  christos CHaddtype(struct parse *p, cset *cs, wctype_t wct)
   1780   1.1       jtc {
   1781  1.39  christos 	wint_t i;
   1782  1.39  christos 	wctype_t *newtypes;
   1783  1.14     lukem 
   1784  1.14     lukem 	_DIAGASSERT(p != NULL);
   1785  1.14     lukem 	_DIAGASSERT(cs != NULL);
   1786  1.14     lukem 
   1787  1.39  christos 	for (i = 0; i < NC; i++)
   1788  1.39  christos 		if (iswctype(i, wct))
   1789  1.39  christos 			CHadd(p, cs, i);
   1790  1.39  christos 	newtypes = reallocarray(cs->types, cs->ntypes + 1,
   1791  1.39  christos 	    sizeof(*cs->types));
   1792  1.39  christos 	if (newtypes == NULL) {
   1793  1.39  christos 		SETERROR(REG_ESPACE);
   1794   1.1       jtc 		return;
   1795  1.39  christos 	}
   1796  1.39  christos 	cs->types = newtypes;
   1797  1.39  christos 	cs->types[cs->ntypes++] = wct;
   1798   1.1       jtc }
   1799   1.1       jtc 
   1800   1.1       jtc /*
   1801   1.1       jtc  - dupl - emit a duplicate of a bunch of sops
   1802   1.9     perry  == static sopno dupl(struct parse *p, sopno start, sopno finish);
   1803   1.1       jtc  */
   1804   1.1       jtc static sopno			/* start of duplicate */
   1805  1.39  christos dupl(struct parse *p,
   1806  1.39  christos 	sopno start,		/* from here */
   1807  1.39  christos 	sopno finish)		/* to this less one */
   1808   1.1       jtc {
   1809  1.39  christos 	sopno ret = HERE();
   1810   1.9     perry 	sopno len = finish - start;
   1811   1.1       jtc 
   1812  1.14     lukem 	_DIAGASSERT(p != NULL);
   1813  1.14     lukem 
   1814   1.1       jtc 	assert(finish >= start);
   1815   1.1       jtc 	if (len == 0)
   1816   1.1       jtc 		return(ret);
   1817  1.39  christos 	if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
   1818  1.39  christos 		return(ret);
   1819  1.40  christos 	(void) memcpy(p->strip + p->slen,
   1820  1.40  christos 	    p->strip + start, len * sizeof(*p->strip));
   1821   1.1       jtc 	p->slen += len;
   1822   1.1       jtc 	return(ret);
   1823   1.1       jtc }
   1824   1.1       jtc 
   1825   1.1       jtc /*
   1826   1.1       jtc  - doemit - emit a strip operator
   1827   1.9     perry  == static void doemit(struct parse *p, sop op, size_t opnd);
   1828   1.1       jtc  *
   1829   1.1       jtc  * It might seem better to implement this as a macro with a function as
   1830   1.1       jtc  * hard-case backup, but it's just too big and messy unless there are
   1831   1.1       jtc  * some changes to the data structures.  Maybe later.
   1832   1.1       jtc  */
   1833   1.1       jtc static void
   1834  1.39  christos doemit(struct parse *p, sop op, size_t opnd)
   1835   1.1       jtc {
   1836   1.1       jtc 	/* avoid making error situations worse */
   1837   1.1       jtc 	if (p->error != 0)
   1838   1.1       jtc 		return;
   1839   1.1       jtc 
   1840  1.39  christos 	_DIAGASSERT(p != NULL);
   1841  1.39  christos 
   1842   1.1       jtc 	/* deal with oversize operands ("can't happen", more or less) */
   1843   1.1       jtc 	assert(opnd < 1<<OPSHIFT);
   1844   1.1       jtc 
   1845   1.1       jtc 	/* deal with undersized strip */
   1846   1.1       jtc 	if (p->slen >= p->ssize)
   1847  1.30  christos 		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
   1848  1.30  christos 			return;
   1849   1.1       jtc 
   1850   1.1       jtc 	/* finally, it's all reduced to the easy case */
   1851  1.40  christos 	p->strip[p->slen++] = (sopno)SOP(op, opnd);
   1852   1.1       jtc }
   1853   1.1       jtc 
   1854   1.1       jtc /*
   1855   1.1       jtc  - doinsert - insert a sop into the strip
   1856   1.9     perry  == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
   1857   1.1       jtc  */
   1858   1.1       jtc static void
   1859  1.39  christos doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
   1860   1.1       jtc {
   1861   1.9     perry 	sopno sn;
   1862   1.9     perry 	sop s;
   1863   1.9     perry 	int i;
   1864   1.1       jtc 
   1865  1.14     lukem 	_DIAGASSERT(p != NULL);
   1866  1.14     lukem 
   1867   1.1       jtc 	/* avoid making error situations worse */
   1868   1.1       jtc 	if (p->error != 0)
   1869   1.1       jtc 		return;
   1870   1.1       jtc 
   1871   1.1       jtc 	sn = HERE();
   1872   1.1       jtc 	EMIT(op, opnd);		/* do checks, ensure space */
   1873   1.1       jtc 	assert(HERE() == sn+1);
   1874   1.1       jtc 	s = p->strip[sn];
   1875   1.1       jtc 
   1876   1.1       jtc 	/* adjust paren pointers */
   1877   1.1       jtc 	assert(pos > 0);
   1878   1.1       jtc 	for (i = 1; i < NPAREN; i++) {
   1879   1.1       jtc 		if (p->pbegin[i] >= pos) {
   1880   1.1       jtc 			p->pbegin[i]++;
   1881   1.1       jtc 		}
   1882   1.1       jtc 		if (p->pend[i] >= pos) {
   1883   1.1       jtc 			p->pend[i]++;
   1884   1.1       jtc 		}
   1885   1.1       jtc 	}
   1886   1.1       jtc 
   1887  1.40  christos 	memmove(&p->strip[pos+1], &p->strip[pos],
   1888  1.40  christos 	    (HERE()-pos-1)*sizeof(*p->strip));
   1889   1.1       jtc 	p->strip[pos] = s;
   1890   1.1       jtc }
   1891   1.1       jtc 
   1892   1.1       jtc /*
   1893   1.1       jtc  - dofwd - complete a forward reference
   1894   1.9     perry  == static void dofwd(struct parse *p, sopno pos, sop value);
   1895   1.1       jtc  */
   1896   1.1       jtc static void
   1897  1.39  christos dofwd(struct parse *p, sopno pos, sop value)
   1898   1.1       jtc {
   1899  1.14     lukem 
   1900  1.14     lukem 	_DIAGASSERT(p != NULL);
   1901  1.14     lukem 
   1902   1.1       jtc 	/* avoid making error situations worse */
   1903   1.1       jtc 	if (p->error != 0)
   1904   1.1       jtc 		return;
   1905   1.1       jtc 
   1906   1.1       jtc 	assert(value < 1<<OPSHIFT);
   1907  1.39  christos 	p->strip[pos] = OP(p->strip[pos]) | value;
   1908   1.1       jtc }
   1909   1.1       jtc 
   1910   1.1       jtc /*
   1911   1.1       jtc  - enlarge - enlarge the strip
   1912  1.39  christos  == static int enlarge(struct parse *p, sopno size);
   1913   1.1       jtc  */
   1914  1.30  christos static int
   1915  1.35     joerg enlarge(struct parse *p, sopno size)
   1916   1.1       jtc {
   1917  1.39  christos 	sop *sp;
   1918  1.39  christos 
   1919  1.14     lukem 	_DIAGASSERT(p != NULL);
   1920  1.14     lukem 
   1921   1.1       jtc 	if (p->ssize >= size)
   1922  1.30  christos 		return 1;
   1923   1.1       jtc 
   1924  1.40  christos 	sp = reallocarray(p->strip, size, sizeof(*p->strip));
   1925  1.39  christos 	if (sp == NULL) {
   1926   1.1       jtc 		SETERROR(REG_ESPACE);
   1927  1.30  christos 		return 0;
   1928   1.1       jtc 	}
   1929  1.39  christos 	p->strip = sp;
   1930  1.35     joerg 	p->ssize = size;
   1931  1.30  christos 	return 1;
   1932   1.1       jtc }
   1933   1.1       jtc 
   1934   1.1       jtc /*
   1935   1.1       jtc  - stripsnug - compact the strip
   1936   1.9     perry  == static void stripsnug(struct parse *p, struct re_guts *g);
   1937   1.1       jtc  */
   1938   1.1       jtc static void
   1939  1.39  christos stripsnug(struct parse *p, struct re_guts *g)
   1940   1.1       jtc {
   1941  1.14     lukem 
   1942  1.14     lukem 	_DIAGASSERT(p != NULL);
   1943  1.14     lukem 	_DIAGASSERT(g != NULL);
   1944  1.14     lukem 
   1945   1.1       jtc 	g->nstates = p->slen;
   1946  1.40  christos 	g->strip = reallocarray(p->strip, p->slen, sizeof(*p->strip));
   1947  1.39  christos 	if (g->strip == NULL) {
   1948  1.39  christos 		SETERROR(REG_ESPACE);
   1949  1.39  christos 		g->strip = p->strip;
   1950  1.39  christos 	}
   1951   1.1       jtc }
   1952   1.1       jtc 
   1953   1.1       jtc /*
   1954   1.1       jtc  - findmust - fill in must and mlen with longest mandatory literal string
   1955   1.9     perry  == static void findmust(struct parse *p, struct re_guts *g);
   1956   1.1       jtc  *
   1957   1.1       jtc  * This algorithm could do fancy things like analyzing the operands of |
   1958   1.1       jtc  * for common subsequences.  Someday.  This code is simple and finds most
   1959   1.1       jtc  * of the interesting cases.
   1960   1.1       jtc  *
   1961   1.1       jtc  * Note that must and mlen got initialized during setup.
   1962   1.1       jtc  */
   1963   1.1       jtc static void
   1964  1.39  christos findmust(struct parse *p, struct re_guts *g)
   1965   1.1       jtc {
   1966   1.9     perry 	sop *scan;
   1967   1.7  christos 	sop *start = NULL;
   1968   1.9     perry 	sop *newstart = NULL;
   1969   1.9     perry 	sopno newlen;
   1970   1.9     perry 	sop s;
   1971   1.9     perry 	char *cp;
   1972  1.39  christos 	int offset;
   1973  1.39  christos 	mbstate_t mbs;
   1974   1.1       jtc 
   1975  1.14     lukem 	_DIAGASSERT(p != NULL);
   1976  1.14     lukem 	_DIAGASSERT(g != NULL);
   1977  1.14     lukem 
   1978   1.1       jtc 	/* avoid making error situations worse */
   1979   1.1       jtc 	if (p->error != 0)
   1980   1.1       jtc 		return;
   1981   1.1       jtc 
   1982  1.39  christos #ifdef notyet
   1983  1.39  christos 	/*
   1984  1.39  christos 	 * It's not generally safe to do a ``char'' substring search on
   1985  1.39  christos 	 * multibyte character strings, but it's safe for at least
   1986  1.39  christos 	 * UTF-8 (see RFC 3629).
   1987  1.39  christos 	 */
   1988  1.39  christos 	if (MB_CUR_MAX > 1 &&
   1989  1.39  christos 	    strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
   1990  1.39  christos 		return;
   1991  1.39  christos #endif
   1992  1.39  christos 
   1993   1.1       jtc 	/* find the longest OCHAR sequence in strip */
   1994   1.1       jtc 	newlen = 0;
   1995  1.39  christos 	offset = 0;
   1996  1.39  christos 	g->moffset = 0;
   1997   1.1       jtc 	scan = g->strip + 1;
   1998   1.1       jtc 	do {
   1999   1.1       jtc 		s = *scan++;
   2000   1.1       jtc 		switch (OP(s)) {
   2001   1.1       jtc 		case OCHAR:		/* sequence member */
   2002  1.39  christos 			if (newlen == 0) {		/* new sequence */
   2003  1.39  christos 				memset(&mbs, 0, sizeof(mbs));
   2004   1.1       jtc 				newstart = scan - 1;
   2005  1.39  christos 			}
   2006  1.42  christos #ifdef NLS
   2007  1.42  christos 			char buf[MB_LEN_MAX];
   2008  1.42  christos 			size_t clen = wcrtomb(buf, (int)OPND(s), &mbs);
   2009  1.39  christos 			if (clen == (size_t)-1)
   2010  1.39  christos 				goto toohard;
   2011  1.40  christos 			newlen += (sopno)clen;
   2012  1.42  christos #else
   2013  1.42  christos 			newlen++;
   2014  1.42  christos #endif
   2015   1.1       jtc 			break;
   2016   1.1       jtc 		case OPLUS_:		/* things that don't break one */
   2017   1.1       jtc 		case OLPAREN:
   2018   1.1       jtc 		case ORPAREN:
   2019   1.1       jtc 			break;
   2020   1.1       jtc 		case OQUEST_:		/* things that must be skipped */
   2021   1.1       jtc 		case OCH_:
   2022  1.39  christos 			offset = altoffset(scan, offset);
   2023   1.1       jtc 			scan--;
   2024   1.1       jtc 			do {
   2025   1.1       jtc 				scan += OPND(s);
   2026   1.1       jtc 				s = *scan;
   2027   1.1       jtc 				/* assert() interferes w debug printouts */
   2028  1.40  christos 				if (OP(s) != O_QUEST &&
   2029  1.40  christos 				    OP(s) != O_CH && OP(s) != OOR2) {
   2030   1.1       jtc 					g->iflags |= BAD;
   2031   1.1       jtc 					return;
   2032   1.1       jtc 				}
   2033  1.40  christos 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
   2034  1.11  christos 			/* FALLTHROUGH */
   2035  1.39  christos 		case OBOW:		/* things that break a sequence */
   2036  1.39  christos 		case OEOW:
   2037  1.39  christos 		case OBOL:
   2038  1.39  christos 		case OEOL:
   2039  1.39  christos 		case OBOS:
   2040  1.39  christos 		case OEOS:
   2041  1.39  christos 		case OWBND:
   2042  1.39  christos 		case ONWBND:
   2043  1.39  christos 		case O_QUEST:
   2044  1.39  christos 		case O_CH:
   2045  1.39  christos 		case OEND:
   2046  1.39  christos 			if (newlen > (sopno)g->mlen) {		/* ends one */
   2047   1.1       jtc 				start = newstart;
   2048   1.1       jtc 				g->mlen = newlen;
   2049  1.39  christos 				if (offset > -1) {
   2050  1.39  christos 					g->moffset += offset;
   2051  1.39  christos 					offset = newlen;
   2052  1.39  christos 				} else
   2053  1.39  christos 					g->moffset = offset;
   2054  1.39  christos 			} else {
   2055  1.39  christos 				if (offset > -1)
   2056  1.39  christos 					offset += newlen;
   2057   1.1       jtc 			}
   2058   1.1       jtc 			newlen = 0;
   2059   1.1       jtc 			break;
   2060  1.39  christos 		case OANY:
   2061  1.39  christos 			if (newlen > (sopno)g->mlen) {		/* ends one */
   2062  1.39  christos 				start = newstart;
   2063  1.39  christos 				g->mlen = newlen;
   2064  1.39  christos 				if (offset > -1) {
   2065  1.39  christos 					g->moffset += offset;
   2066  1.39  christos 					offset = newlen;
   2067  1.39  christos 				} else
   2068  1.39  christos 					g->moffset = offset;
   2069  1.39  christos 			} else {
   2070  1.39  christos 				if (offset > -1)
   2071  1.39  christos 					offset += newlen;
   2072  1.39  christos 			}
   2073  1.39  christos 			if (offset > -1)
   2074  1.39  christos 				offset++;
   2075  1.39  christos 			newlen = 0;
   2076  1.39  christos 			break;
   2077  1.39  christos 		case OANYOF:		/* may or may not invalidate offset */
   2078  1.39  christos 			/* First, everything as OANY */
   2079  1.39  christos 			if (newlen > (sopno)g->mlen) {		/* ends one */
   2080  1.39  christos 				start = newstart;
   2081  1.39  christos 				g->mlen = newlen;
   2082  1.39  christos 				if (offset > -1) {
   2083  1.39  christos 					g->moffset += offset;
   2084  1.39  christos 					offset = newlen;
   2085  1.39  christos 				} else
   2086  1.39  christos 					g->moffset = offset;
   2087  1.39  christos 			} else {
   2088  1.39  christos 				if (offset > -1)
   2089  1.39  christos 					offset += newlen;
   2090  1.39  christos 			}
   2091  1.39  christos 			if (offset > -1)
   2092  1.39  christos 				offset++;
   2093  1.39  christos 			newlen = 0;
   2094  1.39  christos 			break;
   2095  1.42  christos #ifdef NLS
   2096  1.40  christos 		toohard:/*FALLTHROUGH*/
   2097  1.42  christos #endif
   2098  1.39  christos 		default:
   2099  1.39  christos 			/* Anything here makes it impossible or too hard
   2100  1.39  christos 			 * to calculate the offset -- so we give up;
   2101  1.39  christos 			 * save the last known good offset, in case the
   2102  1.39  christos 			 * must sequence doesn't occur later.
   2103  1.39  christos 			 */
   2104  1.39  christos 			if (newlen > (sopno)g->mlen) {		/* ends one */
   2105  1.39  christos 				start = newstart;
   2106  1.39  christos 				g->mlen = newlen;
   2107  1.39  christos 				if (offset > -1)
   2108  1.39  christos 					g->moffset += offset;
   2109  1.39  christos 				else
   2110  1.39  christos 					g->moffset = offset;
   2111  1.39  christos 			}
   2112  1.39  christos 			offset = -1;
   2113  1.39  christos 			newlen = 0;
   2114  1.39  christos 			break;
   2115   1.1       jtc 		}
   2116   1.1       jtc 	} while (OP(s) != OEND);
   2117   1.1       jtc 
   2118  1.39  christos 	if (g->mlen == 0) {		/* there isn't one */
   2119  1.39  christos 		g->moffset = -1;
   2120   1.1       jtc 		return;
   2121  1.39  christos 	}
   2122   1.1       jtc 
   2123   1.1       jtc 	/* turn it into a character string */
   2124   1.1       jtc 	g->must = malloc((size_t)g->mlen + 1);
   2125   1.1       jtc 	if (g->must == NULL) {		/* argh; just forget it */
   2126   1.1       jtc 		g->mlen = 0;
   2127  1.39  christos 		g->moffset = -1;
   2128   1.1       jtc 		return;
   2129   1.1       jtc 	}
   2130   1.1       jtc 	cp = g->must;
   2131   1.1       jtc 	scan = start;
   2132  1.39  christos 	memset(&mbs, 0, sizeof(mbs));
   2133  1.39  christos 	while (cp < g->must + g->mlen) {
   2134   1.1       jtc 		while (OP(s = *scan++) != OCHAR)
   2135   1.1       jtc 			continue;
   2136  1.42  christos #ifdef NLS
   2137  1.43  christos 		size_t clen = wcrtomb(cp, (int)OPND(s), &mbs);
   2138  1.39  christos 		assert(clen != (size_t)-1);
   2139  1.39  christos 		cp += clen;
   2140  1.42  christos #else
   2141  1.42  christos 		*cp++ = OPND(s);
   2142  1.42  christos #endif
   2143   1.1       jtc 	}
   2144   1.2       jtc 	assert(cp == g->must + g->mlen);
   2145   1.1       jtc 	*cp++ = '\0';		/* just on general principles */
   2146   1.1       jtc }
   2147   1.1       jtc 
   2148   1.1       jtc /*
   2149  1.39  christos  - altoffset - choose biggest offset among multiple choices
   2150  1.39  christos  == static int altoffset(sop *scan, int offset);
   2151  1.39  christos  *
   2152  1.39  christos  * Compute, recursively if necessary, the largest offset among multiple
   2153  1.39  christos  * re paths.
   2154  1.39  christos  */
   2155  1.39  christos static int
   2156  1.39  christos altoffset(sop *scan, int offset)
   2157  1.39  christos {
   2158  1.39  christos 	int largest;
   2159  1.39  christos 	int try;
   2160  1.39  christos 	sop s;
   2161  1.39  christos 
   2162  1.39  christos 	_DIAGASSERT(scan != NULL);
   2163  1.39  christos 
   2164  1.39  christos 	/* If we gave up already on offsets, return */
   2165  1.39  christos 	if (offset == -1)
   2166  1.39  christos 		return -1;
   2167  1.39  christos 
   2168  1.39  christos 	largest = 0;
   2169  1.39  christos 	try = 0;
   2170  1.39  christos 	s = *scan++;
   2171  1.40  christos 	while (OP(s) != O_QUEST && OP(s) != O_CH) {
   2172  1.39  christos 		switch (OP(s)) {
   2173  1.39  christos 		case OOR1:
   2174  1.39  christos 			if (try > largest)
   2175  1.39  christos 				largest = try;
   2176  1.39  christos 			try = 0;
   2177  1.39  christos 			break;
   2178  1.39  christos 		case OQUEST_:
   2179  1.39  christos 		case OCH_:
   2180  1.39  christos 			try = altoffset(scan, try);
   2181  1.39  christos 			if (try == -1)
   2182  1.39  christos 				return -1;
   2183  1.39  christos 			scan--;
   2184  1.39  christos 			do {
   2185  1.39  christos 				scan += OPND(s);
   2186  1.39  christos 				s = *scan;
   2187  1.40  christos 				if (OP(s) != O_QUEST &&
   2188  1.40  christos 				    OP(s) != O_CH && OP(s) != OOR2)
   2189  1.39  christos 					return -1;
   2190  1.40  christos 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
   2191  1.39  christos 			/* We must skip to the next position, or we'll
   2192  1.39  christos 			 * leave altoffset() too early.
   2193  1.39  christos 			 */
   2194  1.39  christos 			scan++;
   2195  1.39  christos 			break;
   2196  1.39  christos 		case OANYOF:
   2197  1.39  christos 		case OCHAR:
   2198  1.39  christos 		case OANY:
   2199  1.39  christos 			try++;
   2200  1.40  christos 			/*FALLTHROUGH*/
   2201  1.39  christos 		case OBOW:
   2202  1.39  christos 		case OEOW:
   2203  1.39  christos 		case OWBND:
   2204  1.39  christos 		case ONWBND:
   2205  1.39  christos 		case OLPAREN:
   2206  1.39  christos 		case ORPAREN:
   2207  1.39  christos 		case OOR2:
   2208  1.39  christos 			break;
   2209  1.39  christos 		default:
   2210  1.39  christos 			try = -1;
   2211  1.39  christos 			break;
   2212  1.39  christos 		}
   2213  1.39  christos 		if (try == -1)
   2214  1.39  christos 			return -1;
   2215  1.39  christos 		s = *scan++;
   2216  1.39  christos 	}
   2217  1.39  christos 
   2218  1.39  christos 	if (try > largest)
   2219  1.39  christos 		largest = try;
   2220  1.39  christos 
   2221  1.39  christos 	return largest+offset;
   2222  1.39  christos }
   2223  1.39  christos 
   2224  1.39  christos /*
   2225  1.39  christos  - computejumps - compute char jumps for BM scan
   2226  1.39  christos  == static void computejumps(struct parse *p, struct re_guts *g);
   2227  1.39  christos  *
   2228  1.39  christos  * This algorithm assumes g->must exists and is has size greater than
   2229  1.39  christos  * zero. It's based on the algorithm found on Computer Algorithms by
   2230  1.39  christos  * Sara Baase.
   2231  1.39  christos  *
   2232  1.39  christos  * A char jump is the number of characters one needs to jump based on
   2233  1.39  christos  * the value of the character from the text that was mismatched.
   2234  1.39  christos  */
   2235  1.39  christos static void
   2236  1.39  christos computejumps(struct parse *p, struct re_guts *g)
   2237  1.39  christos {
   2238  1.39  christos 	int ch;
   2239  1.39  christos 	size_t mindex;
   2240  1.39  christos 
   2241  1.39  christos 	_DIAGASSERT(p != NULL);
   2242  1.39  christos 	_DIAGASSERT(g != NULL);
   2243  1.39  christos 
   2244  1.39  christos 	/* Avoid making errors worse */
   2245  1.39  christos 	if (p->error != 0)
   2246  1.39  christos 		return;
   2247  1.39  christos 
   2248  1.39  christos 	g->charjump = calloc((NC_MAX + 1), sizeof(*g->charjump));
   2249  1.39  christos 	if (g->charjump == NULL)	/* Not a fatal error */
   2250  1.39  christos 		return;
   2251  1.39  christos 	/* Adjust for signed chars, if necessary */
   2252  1.39  christos 	g->charjump = &g->charjump[-(CHAR_MIN)];
   2253  1.39  christos 
   2254  1.39  christos 	/* If the character does not exist in the pattern, the jump
   2255  1.39  christos 	 * is equal to the number of characters in the pattern.
   2256  1.39  christos 	 */
   2257  1.39  christos 	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
   2258  1.39  christos 		g->charjump[ch] = g->mlen;
   2259  1.39  christos 
   2260  1.39  christos 	/* If the character does exist, compute the jump that would
   2261  1.39  christos 	 * take us to the last character in the pattern equal to it
   2262  1.39  christos 	 * (notice that we match right to left, so that last character
   2263  1.39  christos 	 * is the first one that would be matched).
   2264  1.39  christos 	 */
   2265  1.39  christos 	for (mindex = 0; mindex < g->mlen; mindex++)
   2266  1.39  christos 		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
   2267  1.39  christos }
   2268  1.39  christos 
   2269  1.39  christos /*
   2270  1.39  christos  - computematchjumps - compute match jumps for BM scan
   2271  1.39  christos  == static void computematchjumps(struct parse *p, struct re_guts *g);
   2272  1.39  christos  *
   2273  1.39  christos  * This algorithm assumes g->must exists and is has size greater than
   2274  1.39  christos  * zero. It's based on the algorithm found on Computer Algorithms by
   2275  1.39  christos  * Sara Baase.
   2276  1.39  christos  *
   2277  1.39  christos  * A match jump is the number of characters one needs to advance based
   2278  1.39  christos  * on the already-matched suffix.
   2279  1.39  christos  * Notice that all values here are minus (g->mlen-1), because of the way
   2280  1.39  christos  * the search algorithm works.
   2281  1.39  christos  */
   2282  1.39  christos static void
   2283  1.39  christos computematchjumps(struct parse *p, struct re_guts *g)
   2284  1.39  christos {
   2285  1.39  christos 	size_t mindex;		/* General "must" iterator */
   2286  1.39  christos 	size_t suffix;		/* Keeps track of matching suffix */
   2287  1.39  christos 	size_t ssuffix;		/* Keeps track of suffixes' suffix */
   2288  1.39  christos 	size_t* pmatches;	/* pmatches[k] points to the next i
   2289  1.39  christos 				 * such that i+1...mlen is a substring
   2290  1.39  christos 				 * of k+1...k+mlen-i-1
   2291  1.39  christos 				 */
   2292  1.39  christos 
   2293  1.39  christos 	_DIAGASSERT(p != NULL);
   2294  1.39  christos 	_DIAGASSERT(g != NULL);
   2295  1.39  christos 
   2296  1.39  christos 	/* Avoid making errors worse */
   2297  1.39  christos 	if (p->error != 0)
   2298  1.39  christos 		return;
   2299  1.39  christos 
   2300  1.39  christos 	pmatches = calloc(g->mlen, sizeof(*pmatches));
   2301  1.39  christos 	if (pmatches == NULL) {
   2302  1.39  christos 		g->matchjump = NULL;
   2303  1.39  christos 		return;
   2304  1.39  christos 	}
   2305  1.39  christos 
   2306  1.39  christos 	g->matchjump = calloc(g->mlen, sizeof(*g->matchjump));
   2307  1.39  christos 	if (g->matchjump == NULL) {	/* Not a fatal error */
   2308  1.39  christos 		free(pmatches);
   2309  1.39  christos 		return;
   2310  1.39  christos 	}
   2311  1.39  christos 
   2312  1.39  christos 	/* Set maximum possible jump for each character in the pattern */
   2313  1.39  christos 	for (mindex = 0; mindex < g->mlen; mindex++)
   2314  1.39  christos 		g->matchjump[mindex] = 2 * g->mlen - mindex - 1;
   2315  1.39  christos 
   2316  1.39  christos 	/* Compute pmatches[] */
   2317  1.39  christos 	for (suffix = mindex = g->mlen; mindex-- > 0; suffix--) {
   2318  1.39  christos 		pmatches[mindex] = suffix;
   2319  1.39  christos 
   2320  1.39  christos 		/* If a mismatch is found, interrupting the substring,
   2321  1.39  christos 		 * compute the matchjump for that position. If no
   2322  1.39  christos 		 * mismatch is found, then a text substring mismatched
   2323  1.39  christos 		 * against the suffix will also mismatch against the
   2324  1.39  christos 		 * substring.
   2325  1.39  christos 		 */
   2326  1.39  christos 		while (suffix < g->mlen
   2327  1.39  christos 		    && g->must[mindex] != g->must[suffix]) {
   2328  1.39  christos 			g->matchjump[suffix] = MIN(g->matchjump[suffix],
   2329  1.39  christos 			    g->mlen - mindex - 1);
   2330  1.39  christos 			suffix = pmatches[suffix];
   2331  1.39  christos 		}
   2332  1.39  christos 	}
   2333  1.39  christos 
   2334  1.39  christos 	/* Compute the matchjump up to the last substring found to jump
   2335  1.39  christos 	 * to the beginning of the largest must pattern prefix matching
   2336  1.39  christos 	 * it's own suffix.
   2337  1.39  christos 	 */
   2338  1.39  christos 	for (mindex = 0; mindex <= suffix; mindex++)
   2339  1.39  christos 		g->matchjump[mindex] = MIN(g->matchjump[mindex],
   2340  1.39  christos 		    g->mlen + suffix - mindex);
   2341  1.39  christos 
   2342  1.39  christos         ssuffix = pmatches[suffix];
   2343  1.39  christos         while (suffix < g->mlen) {
   2344  1.39  christos                 while (suffix <= ssuffix && suffix < g->mlen) {
   2345  1.39  christos                         g->matchjump[suffix] = MIN(g->matchjump[suffix],
   2346  1.39  christos 			    g->mlen + ssuffix - suffix);
   2347  1.39  christos                         suffix++;
   2348  1.39  christos                 }
   2349  1.39  christos 		if (suffix < g->mlen)
   2350  1.39  christos                 	ssuffix = pmatches[ssuffix];
   2351  1.39  christos         }
   2352  1.39  christos 
   2353  1.39  christos 	free(pmatches);
   2354  1.39  christos }
   2355  1.39  christos 
   2356  1.39  christos /*
   2357   1.1       jtc  - pluscount - count + nesting
   2358   1.9     perry  == static sopno pluscount(struct parse *p, struct re_guts *g);
   2359   1.1       jtc  */
   2360   1.1       jtc static sopno			/* nesting depth */
   2361  1.39  christos pluscount(struct parse *p, struct re_guts *g)
   2362   1.1       jtc {
   2363   1.9     perry 	sop *scan;
   2364   1.9     perry 	sop s;
   2365   1.9     perry 	sopno plusnest = 0;
   2366   1.9     perry 	sopno maxnest = 0;
   2367  1.14     lukem 
   2368  1.14     lukem 	_DIAGASSERT(p != NULL);
   2369  1.14     lukem 	_DIAGASSERT(g != NULL);
   2370   1.1       jtc 
   2371   1.1       jtc 	if (p->error != 0)
   2372   1.1       jtc 		return(0);	/* there may not be an OEND */
   2373   1.1       jtc 
   2374   1.1       jtc 	scan = g->strip + 1;
   2375   1.1       jtc 	do {
   2376   1.1       jtc 		s = *scan++;
   2377   1.1       jtc 		switch (OP(s)) {
   2378   1.1       jtc 		case OPLUS_:
   2379   1.1       jtc 			plusnest++;
   2380   1.1       jtc 			break;
   2381   1.1       jtc 		case O_PLUS:
   2382   1.1       jtc 			if (plusnest > maxnest)
   2383   1.1       jtc 				maxnest = plusnest;
   2384   1.1       jtc 			plusnest--;
   2385   1.1       jtc 			break;
   2386   1.1       jtc 		}
   2387   1.1       jtc 	} while (OP(s) != OEND);
   2388   1.1       jtc 	if (plusnest != 0)
   2389   1.1       jtc 		g->iflags |= BAD;
   2390   1.1       jtc 	return(maxnest);
   2391   1.1       jtc }
   2392