1 1.49 christos /* $NetBSD: regcomp.c,v 1.49 2025/01/01 18:19:50 christos Exp $ */ 2 1.6 cgd 3 1.5 cgd /*- 4 1.39 christos * SPDX-License-Identifier: BSD-3-Clause 5 1.39 christos * 6 1.39 christos * Copyright (c) 1992, 1993, 1994 Henry Spencer. 7 1.5 cgd * Copyright (c) 1992, 1993, 1994 8 1.5 cgd * The Regents of the University of California. All rights reserved. 9 1.5 cgd * 10 1.39 christos * Copyright (c) 2011 The FreeBSD Foundation 11 1.39 christos * All rights reserved. 12 1.39 christos * Portions of this software were developed by David Chisnall 13 1.39 christos * under sponsorship from the FreeBSD Foundation. 14 1.39 christos * 15 1.5 cgd * This code is derived from software contributed to Berkeley by 16 1.5 cgd * Henry Spencer. 17 1.5 cgd * 18 1.5 cgd * Redistribution and use in source and binary forms, with or without 19 1.5 cgd * modification, are permitted provided that the following conditions 20 1.5 cgd * are met: 21 1.5 cgd * 1. Redistributions of source code must retain the above copyright 22 1.5 cgd * notice, this list of conditions and the following disclaimer. 23 1.5 cgd * 2. Redistributions in binary form must reproduce the above copyright 24 1.5 cgd * notice, this list of conditions and the following disclaimer in the 25 1.5 cgd * documentation and/or other materials provided with the distribution. 26 1.18 agc * 3. Neither the name of the University nor the names of its contributors 27 1.18 agc * may be used to endorse or promote products derived from this software 28 1.18 agc * without specific prior written permission. 29 1.18 agc * 30 1.18 agc * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 31 1.18 agc * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 32 1.18 agc * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 33 1.18 agc * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 34 1.18 agc * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 35 1.18 agc * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 36 1.18 agc * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 37 1.18 agc * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 38 1.18 agc * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 39 1.18 agc * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 40 1.18 agc * SUCH DAMAGE. 41 1.18 agc * 42 1.18 agc * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 43 1.18 agc */ 44 1.18 agc 45 1.45 christos #if HAVE_NBTOOL_CONFIG_H 46 1.45 christos #include "nbtool_config.h" 47 1.45 christos #endif 48 1.45 christos 49 1.7 christos #include <sys/cdefs.h> 50 1.6 cgd #if 0 51 1.5 cgd static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 52 1.39 christos __FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 368359 2020-12-05 03:18:48Z kevans $"); 53 1.6 cgd #endif 54 1.49 christos __RCSID("$NetBSD: regcomp.c,v 1.49 2025/01/01 18:19:50 christos Exp $"); 55 1.42 christos 56 1.42 christos #ifndef LIBHACK 57 1.39 christos #define REGEX_GNU_EXTENSIONS 58 1.5 cgd 59 1.8 jtc #include "namespace.h" 60 1.42 christos #endif 61 1.1 jtc #include <sys/types.h> 62 1.39 christos #include <stdio.h> 63 1.39 christos #include <string.h> 64 1.1 jtc #include <ctype.h> 65 1.1 jtc #include <limits.h> 66 1.1 jtc #include <stdlib.h> 67 1.28 junyoung #include <regex.h> 68 1.39 christos #include <stdbool.h> 69 1.8 jtc 70 1.44 christos #if defined(__weak_alias) && !defined(LIBHACK) 71 1.16 mycroft __weak_alias(regcomp,_regcomp) 72 1.8 jtc #endif 73 1.1 jtc 74 1.39 christos #ifdef REGEX_LIBC_COLLATE 75 1.39 christos #include "collate.h" 76 1.39 christos #endif 77 1.39 christos 78 1.1 jtc #include "utils.h" 79 1.1 jtc #include "regex2.h" 80 1.1 jtc 81 1.1 jtc #include "cname.h" 82 1.1 jtc 83 1.1 jtc /* 84 1.39 christos * Branching context, used to keep track of branch state for all of the branch- 85 1.39 christos * aware functions. In addition to keeping track of branch positions for the 86 1.39 christos * p_branch_* functions, we use this to simplify some clumsiness in BREs for 87 1.39 christos * detection of whether ^ is acting as an anchor or being used erroneously and 88 1.39 christos * also for whether we're in a sub-expression or not. 89 1.39 christos */ 90 1.39 christos struct branchc { 91 1.39 christos sopno start; 92 1.39 christos sopno back; 93 1.39 christos sopno fwd; 94 1.39 christos 95 1.39 christos int nbranch; 96 1.39 christos int nchain; 97 1.39 christos bool outer; 98 1.39 christos bool terminate; 99 1.39 christos }; 100 1.39 christos 101 1.39 christos /* 102 1.1 jtc * parse structure, passed up and down to avoid global variables and 103 1.1 jtc * other clumsinesses 104 1.1 jtc */ 105 1.1 jtc struct parse { 106 1.21 yamt const char *next; /* next character in RE */ 107 1.21 yamt const char *end; /* end of string (-> NUL normally) */ 108 1.1 jtc int error; /* has an error been seen? */ 109 1.39 christos int gnuext; 110 1.1 jtc sop *strip; /* malloced strip */ 111 1.1 jtc sopno ssize; /* malloced strip size (allocated) */ 112 1.1 jtc sopno slen; /* malloced strip length (used) */ 113 1.33 christos size_t ncsalloc; /* number of csets allocated */ 114 1.1 jtc struct re_guts *g; 115 1.1 jtc # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 116 1.1 jtc sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 117 1.1 jtc sopno pend[NPAREN]; /* -> ) ([0] unused) */ 118 1.39 christos bool allowbranch; /* can this expression branch? */ 119 1.39 christos bool bre; /* convenience; is this a BRE? */ 120 1.39 christos int pflags; /* other parsing flags -- legacy escapes? */ 121 1.39 christos bool (*parse_expr)(struct parse *, struct branchc *); 122 1.39 christos void (*pre_parse)(struct parse *, struct branchc *); 123 1.39 christos void (*post_parse)(struct parse *, struct branchc *); 124 1.1 jtc }; 125 1.1 jtc 126 1.39 christos #define PFLAG_LEGACY_ESC 0x00000001 127 1.39 christos 128 1.5 cgd /* ========= begin header generated by ./mkh ========= */ 129 1.5 cgd #ifdef __cplusplus 130 1.5 cgd extern "C" { 131 1.5 cgd #endif 132 1.5 cgd 133 1.5 cgd /* === regcomp.c === */ 134 1.39 christos static bool p_ere_exp(struct parse *p, struct branchc *bc); 135 1.26 junyoung static void p_str(struct parse *p); 136 1.39 christos static int p_branch_eat_delim(struct parse *p, struct branchc *bc); 137 1.39 christos static void p_branch_ins_offset(struct parse *p, struct branchc *bc); 138 1.39 christos static void p_branch_fix_tail(struct parse *p, struct branchc *bc); 139 1.39 christos static bool p_branch_empty(struct parse *p, struct branchc *bc); 140 1.39 christos static bool p_branch_do(struct parse *p, struct branchc *bc); 141 1.39 christos static void p_bre_pre_parse(struct parse *p, struct branchc *bc); 142 1.39 christos static void p_bre_post_parse(struct parse *p, struct branchc *bc); 143 1.39 christos static void p_re(struct parse *p, int end1, int end2); 144 1.39 christos static bool p_simp_re(struct parse *p, struct branchc *bc); 145 1.26 junyoung static int p_count(struct parse *p); 146 1.26 junyoung static void p_bracket(struct parse *p); 147 1.39 christos static int p_range_cmp(wchar_t c1, wchar_t c2); 148 1.26 junyoung static void p_b_term(struct parse *p, cset *cs); 149 1.39 christos #ifdef REGEX_GNU_EXTENSIONS 150 1.39 christos static int p_b_pseudoclass(struct parse *p, char c); 151 1.39 christos #endif 152 1.26 junyoung static void p_b_cclass(struct parse *p, cset *cs); 153 1.39 christos static void p_b_cclass_named(struct parse *p, cset *cs, const char[]); 154 1.26 junyoung static void p_b_eclass(struct parse *p, cset *cs); 155 1.39 christos static wint_t p_b_symbol(struct parse *p); 156 1.39 christos static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 157 1.39 christos static bool may_escape(struct parse *p, const wint_t ch); 158 1.39 christos static wint_t othercase(wint_t ch); 159 1.39 christos static void bothcases(struct parse *p, wint_t ch); 160 1.39 christos static void ordinary(struct parse *p, wint_t ch); 161 1.26 junyoung static void nonnewline(struct parse *p); 162 1.39 christos static void repeat(struct parse *p, sopno start, int from, int to); 163 1.26 junyoung static int seterr(struct parse *p, int e); 164 1.26 junyoung static cset *allocset(struct parse *p); 165 1.26 junyoung static void freeset(struct parse *p, cset *cs); 166 1.39 christos static void CHadd(struct parse *p, cset *cs, wint_t ch); 167 1.39 christos static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 168 1.39 christos static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 169 1.39 christos static wint_t singleton(cset *cs); 170 1.26 junyoung static sopno dupl(struct parse *p, sopno start, sopno finish); 171 1.39 christos static void doemit(struct parse *p, sop op, size_t opnd); 172 1.39 christos static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 173 1.39 christos static void dofwd(struct parse *p, sopno pos, sop value); 174 1.30 christos static int enlarge(struct parse *p, sopno size); 175 1.26 junyoung static void stripsnug(struct parse *p, struct re_guts *g); 176 1.26 junyoung static void findmust(struct parse *p, struct re_guts *g); 177 1.39 christos static int altoffset(sop *scan, int offset); 178 1.39 christos static void computejumps(struct parse *p, struct re_guts *g); 179 1.39 christos static void computematchjumps(struct parse *p, struct re_guts *g); 180 1.26 junyoung static sopno pluscount(struct parse *p, struct re_guts *g); 181 1.39 christos static wint_t wgetnext(struct parse *p); 182 1.5 cgd 183 1.5 cgd #ifdef __cplusplus 184 1.5 cgd } 185 1.5 cgd #endif 186 1.5 cgd /* ========= end header generated by ./mkh ========= */ 187 1.1 jtc 188 1.1 jtc static char nuls[10]; /* place to point scanner in event of error */ 189 1.1 jtc 190 1.1 jtc /* 191 1.1 jtc * macros for use with parse structure 192 1.1 jtc * BEWARE: these know that the parse structure is named `p' !!! 193 1.1 jtc */ 194 1.1 jtc #define PEEK() (*p->next) 195 1.1 jtc #define PEEK2() (*(p->next+1)) 196 1.1 jtc #define MORE() (p->next < p->end) 197 1.1 jtc #define MORE2() (p->next+1 < p->end) 198 1.1 jtc #define SEE(c) (MORE() && PEEK() == (c)) 199 1.1 jtc #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 200 1.39 christos #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a)) 201 1.1 jtc #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 202 1.1 jtc #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 203 1.39 christos #define EATSPEC(a) (p->bre ? EATTWO('\\', a) : EAT(a)) 204 1.1 jtc #define NEXT() (p->next++) 205 1.1 jtc #define NEXT2() (p->next += 2) 206 1.1 jtc #define NEXTn(n) (p->next += (n)) 207 1.1 jtc #define GETNEXT() (*p->next++) 208 1.39 christos #define WGETNEXT() wgetnext(p) 209 1.1 jtc #define SETERROR(e) seterr(p, (e)) 210 1.39 christos #define REQUIRE(co, e) ((co) || SETERROR(e)) 211 1.1 jtc #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 212 1.39 christos #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 213 1.1 jtc #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 214 1.40 christos #define EMIT(op, sopnd) doemit(p, (op), (sopnd)) 215 1.40 christos #define INSERT(op, pos) doinsert(p, (op), HERE()-(pos)+1, pos) 216 1.12 drochner #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 217 1.1 jtc #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 218 1.1 jtc #define HERE() (p->slen) 219 1.1 jtc #define THERE() (p->slen - 1) 220 1.4 jtc #define THERETHERE() (p->slen - 2) 221 1.1 jtc #define DROP(n) (p->slen -= (n)) 222 1.1 jtc 223 1.39 christos /* Macro used by computejump()/computematchjump() */ 224 1.41 christos #ifndef MIN 225 1.39 christos #define MIN(a,b) ((a)<(b)?(a):(b)) 226 1.41 christos #endif 227 1.30 christos 228 1.42 christos #ifndef NLS 229 1.42 christos static const struct { 230 1.42 christos const char *name; 231 1.42 christos int (*func)(int); 232 1.42 christos } wctypes[] = { 233 1.42 christos #define ADD(x) { .name = # x, .func = is ## x } 234 1.42 christos ADD(alnum), 235 1.42 christos ADD(alpha), 236 1.42 christos ADD(blank), 237 1.42 christos ADD(cntrl), 238 1.42 christos ADD(digit), 239 1.42 christos ADD(graph), 240 1.42 christos ADD(lower), 241 1.42 christos ADD(print), 242 1.42 christos ADD(punct), 243 1.42 christos ADD(space), 244 1.42 christos ADD(upper), 245 1.42 christos ADD(xdigit), 246 1.42 christos #undef ADD 247 1.42 christos }; 248 1.42 christos 249 1.42 christos wctype_t 250 1.42 christos __regex_wctype(const char *str) 251 1.42 christos { 252 1.42 christos for (size_t i = 0; i < __arraycount(wctypes); i++) { 253 1.42 christos if (strcmp(wctypes[i].name, str) == 0) 254 1.42 christos return (wctype_t)(i + 1); 255 1.42 christos } 256 1.42 christos return (wctype_t)0; 257 1.42 christos } 258 1.42 christos 259 1.42 christos int 260 1.42 christos __regex_iswctype(wint_t c, wctype_t ct) 261 1.42 christos { 262 1.42 christos if (ct == 0) 263 1.42 christos return 0; 264 1.42 christos return (*wctypes[ct - 1].func)(c); 265 1.42 christos } 266 1.42 christos #endif 267 1.42 christos 268 1.39 christos static int /* 0 success, otherwise REG_something */ 269 1.39 christos regcomp_internal(regex_t * __restrict preg, 270 1.39 christos const char * __restrict pattern, 271 1.39 christos int cflags, int pflags) 272 1.1 jtc { 273 1.1 jtc struct parse pa; 274 1.9 perry struct re_guts *g; 275 1.9 perry struct parse *p = &pa; 276 1.9 perry int i; 277 1.9 perry size_t len; 278 1.39 christos size_t maxlen; 279 1.3 jtc #ifdef REDEBUG 280 1.3 jtc # define GOODFLAGS(f) (f) 281 1.3 jtc #else 282 1.3 jtc # define GOODFLAGS(f) ((f)&~REG_DUMP) 283 1.3 jtc #endif 284 1.1 jtc 285 1.14 lukem _DIAGASSERT(preg != NULL); 286 1.14 lukem _DIAGASSERT(pattern != NULL); 287 1.14 lukem 288 1.3 jtc cflags = GOODFLAGS(cflags); 289 1.1 jtc if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 290 1.1 jtc return(REG_INVARG); 291 1.1 jtc 292 1.1 jtc if (cflags®_PEND) { 293 1.1 jtc if (preg->re_endp < pattern) 294 1.1 jtc return(REG_INVARG); 295 1.1 jtc len = preg->re_endp - pattern; 296 1.1 jtc } else 297 1.11 christos len = strlen(pattern); 298 1.1 jtc 299 1.1 jtc /* do the mallocs early so failure handling is easy */ 300 1.39 christos g = malloc(sizeof(*g)); 301 1.1 jtc if (g == NULL) 302 1.1 jtc return(REG_ESPACE); 303 1.39 christos /* 304 1.39 christos * Limit the pattern space to avoid a 32-bit overflow on buffer 305 1.39 christos * extension. Also avoid any signed overflow in case of conversion 306 1.39 christos * so make the real limit based on a 31-bit overflow. 307 1.39 christos * 308 1.39 christos * Likely not applicable on 64-bit systems but handle the case 309 1.39 christos * generically (who are we to stop people from using ~715MB+ 310 1.39 christos * patterns?). 311 1.39 christos */ 312 1.40 christos maxlen = ((size_t)-1 >> 1) / sizeof(*p->strip) * 2 / 3; 313 1.39 christos if (len >= maxlen) { 314 1.40 christos free(g); 315 1.39 christos return(REG_ESPACE); 316 1.39 christos } 317 1.40 christos p->ssize = (sopno)(len / 2 * 3 + 1); /* ugh */ 318 1.39 christos assert(p->ssize >= len); 319 1.39 christos 320 1.39 christos p->strip = calloc(p->ssize, sizeof(*p->strip)); 321 1.1 jtc p->slen = 0; 322 1.1 jtc if (p->strip == NULL) { 323 1.40 christos free(g); 324 1.1 jtc return(REG_ESPACE); 325 1.1 jtc } 326 1.1 jtc 327 1.1 jtc /* set things up */ 328 1.1 jtc p->g = g; 329 1.39 christos p->next = pattern; /* convenience; we do not modify it */ 330 1.1 jtc p->end = p->next + len; 331 1.1 jtc p->error = 0; 332 1.1 jtc p->ncsalloc = 0; 333 1.39 christos p->pflags = pflags; 334 1.1 jtc for (i = 0; i < NPAREN; i++) { 335 1.1 jtc p->pbegin[i] = 0; 336 1.1 jtc p->pend[i] = 0; 337 1.1 jtc } 338 1.39 christos #ifdef REGEX_GNU_EXTENSIONS 339 1.39 christos if ((cflags & REG_GNU) == 0) { 340 1.39 christos p->gnuext = false; 341 1.39 christos p->allowbranch = (cflags & REG_EXTENDED) != 0; 342 1.39 christos } else 343 1.39 christos p->gnuext = p->allowbranch = true; 344 1.39 christos #else 345 1.39 christos p->gnuext = false; 346 1.39 christos p->allowbranch = (cflags & REG_EXTENDED) != 0; 347 1.39 christos #endif 348 1.39 christos if (cflags & REG_EXTENDED) { 349 1.39 christos p->bre = false; 350 1.39 christos p->parse_expr = p_ere_exp; 351 1.39 christos p->pre_parse = NULL; 352 1.39 christos p->post_parse = NULL; 353 1.39 christos } else { 354 1.39 christos p->bre = true; 355 1.39 christos p->parse_expr = p_simp_re; 356 1.39 christos p->pre_parse = p_bre_pre_parse; 357 1.39 christos p->post_parse = p_bre_post_parse; 358 1.39 christos } 359 1.1 jtc g->sets = NULL; 360 1.1 jtc g->ncsets = 0; 361 1.1 jtc g->cflags = cflags; 362 1.1 jtc g->iflags = 0; 363 1.1 jtc g->nbol = 0; 364 1.1 jtc g->neol = 0; 365 1.1 jtc g->must = NULL; 366 1.39 christos g->moffset = -1; 367 1.39 christos g->charjump = NULL; 368 1.39 christos g->matchjump = NULL; 369 1.1 jtc g->mlen = 0; 370 1.1 jtc g->nsub = 0; 371 1.1 jtc g->backrefs = 0; 372 1.1 jtc 373 1.1 jtc /* do it */ 374 1.1 jtc EMIT(OEND, 0); 375 1.1 jtc g->firststate = THERE(); 376 1.39 christos if (cflags & REG_NOSPEC) 377 1.1 jtc p_str(p); 378 1.1 jtc else 379 1.39 christos p_re(p, OUT, OUT); 380 1.1 jtc EMIT(OEND, 0); 381 1.1 jtc g->laststate = THERE(); 382 1.1 jtc 383 1.1 jtc /* tidy up loose ends and fill things in */ 384 1.1 jtc stripsnug(p, g); 385 1.1 jtc findmust(p, g); 386 1.39 christos /* only use Boyer-Moore algorithm if the pattern is bigger 387 1.39 christos * than three characters 388 1.39 christos */ 389 1.39 christos if(g->mlen > 3) { 390 1.39 christos computejumps(p, g); 391 1.39 christos computematchjumps(p, g); 392 1.39 christos if(g->matchjump == NULL && g->charjump != NULL) { 393 1.39 christos free(g->charjump); 394 1.39 christos g->charjump = NULL; 395 1.39 christos } 396 1.39 christos } 397 1.1 jtc g->nplus = pluscount(p, g); 398 1.1 jtc g->magic = MAGIC2; 399 1.1 jtc preg->re_nsub = g->nsub; 400 1.1 jtc preg->re_g = g; 401 1.1 jtc preg->re_magic = MAGIC1; 402 1.1 jtc #ifndef REDEBUG 403 1.1 jtc /* not debugging, so can't rely on the assert() in regexec() */ 404 1.1 jtc if (g->iflags&BAD) 405 1.1 jtc SETERROR(REG_ASSERT); 406 1.1 jtc #endif 407 1.1 jtc 408 1.1 jtc /* win or lose, we're done */ 409 1.1 jtc if (p->error != 0) /* lose */ 410 1.1 jtc regfree(preg); 411 1.1 jtc return(p->error); 412 1.1 jtc } 413 1.1 jtc 414 1.1 jtc /* 415 1.39 christos - regcomp - interface for parser and compilation 416 1.39 christos = extern int regcomp(regex_t *, const char *, int); 417 1.39 christos = #define REG_BASIC 0000 418 1.39 christos = #define REG_EXTENDED 0001 419 1.39 christos = #define REG_ICASE 0002 420 1.39 christos = #define REG_NOSUB 0004 421 1.39 christos = #define REG_NEWLINE 0010 422 1.39 christos = #define REG_NOSPEC 0020 423 1.39 christos = #define REG_PEND 0040 424 1.39 christos = #define REG_DUMP 0200 425 1.1 jtc */ 426 1.39 christos int /* 0 success, otherwise REG_something */ 427 1.39 christos regcomp(regex_t * __restrict preg, 428 1.39 christos const char * __restrict pattern, 429 1.39 christos int cflags) 430 1.1 jtc { 431 1.30 christos 432 1.39 christos return (regcomp_internal(preg, pattern, cflags, 0)); 433 1.1 jtc } 434 1.1 jtc 435 1.1 jtc /* 436 1.39 christos - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op, 437 1.39 christos - return whether we should terminate or not 438 1.39 christos == static bool p_ere_exp(struct parse *p); 439 1.1 jtc */ 440 1.39 christos static bool 441 1.39 christos p_ere_exp(struct parse *p, struct branchc *bc) 442 1.1 jtc { 443 1.9 perry char c; 444 1.39 christos wint_t wc; 445 1.9 perry sopno pos; 446 1.9 perry int count; 447 1.9 perry int count2; 448 1.39 christos #ifdef REGEX_GNU_EXTENSIONS 449 1.40 christos size_t i; 450 1.39 christos int handled; 451 1.39 christos #endif 452 1.9 perry sopno subno; 453 1.1 jtc int wascaret = 0; 454 1.1 jtc 455 1.14 lukem _DIAGASSERT(p != NULL); 456 1.14 lukem 457 1.39 christos (void)bc; 458 1.1 jtc assert(MORE()); /* caller should have ensured this */ 459 1.1 jtc c = GETNEXT(); 460 1.1 jtc 461 1.39 christos #ifdef REGEX_GNU_EXTENSIONS 462 1.39 christos handled = 0; 463 1.39 christos #endif 464 1.1 jtc pos = HERE(); 465 1.1 jtc switch (c) { 466 1.1 jtc case '(': 467 1.39 christos (void)REQUIRE(MORE(), REG_EPAREN); 468 1.1 jtc p->g->nsub++; 469 1.40 christos subno = (sopno)p->g->nsub; 470 1.1 jtc if (subno < NPAREN) 471 1.1 jtc p->pbegin[subno] = HERE(); 472 1.1 jtc EMIT(OLPAREN, subno); 473 1.1 jtc if (!SEE(')')) 474 1.39 christos p_re(p, ')', IGN); 475 1.1 jtc if (subno < NPAREN) { 476 1.1 jtc p->pend[subno] = HERE(); 477 1.1 jtc assert(p->pend[subno] != 0); 478 1.1 jtc } 479 1.1 jtc EMIT(ORPAREN, subno); 480 1.39 christos (void)MUSTEAT(')', REG_EPAREN); 481 1.1 jtc break; 482 1.1 jtc #ifndef POSIX_MISTAKE 483 1.1 jtc case ')': /* happens only if no current unmatched ( */ 484 1.1 jtc /* 485 1.1 jtc * You may ask, why the ifndef? Because I didn't notice 486 1.1 jtc * this until slightly too late for 1003.2, and none of the 487 1.1 jtc * other 1003.2 regular-expression reviewers noticed it at 488 1.1 jtc * all. So an unmatched ) is legal POSIX, at least until 489 1.1 jtc * we can get it fixed. 490 1.1 jtc */ 491 1.1 jtc SETERROR(REG_EPAREN); 492 1.1 jtc break; 493 1.1 jtc #endif 494 1.1 jtc case '^': 495 1.1 jtc EMIT(OBOL, 0); 496 1.1 jtc p->g->iflags |= USEBOL; 497 1.1 jtc p->g->nbol++; 498 1.1 jtc wascaret = 1; 499 1.1 jtc break; 500 1.1 jtc case '$': 501 1.1 jtc EMIT(OEOL, 0); 502 1.1 jtc p->g->iflags |= USEEOL; 503 1.1 jtc p->g->neol++; 504 1.1 jtc break; 505 1.1 jtc case '|': 506 1.1 jtc SETERROR(REG_EMPTY); 507 1.1 jtc break; 508 1.1 jtc case '*': 509 1.1 jtc case '+': 510 1.1 jtc case '?': 511 1.39 christos case '{': 512 1.1 jtc SETERROR(REG_BADRPT); 513 1.1 jtc break; 514 1.1 jtc case '.': 515 1.1 jtc if (p->g->cflags®_NEWLINE) 516 1.1 jtc nonnewline(p); 517 1.1 jtc else 518 1.1 jtc EMIT(OANY, 0); 519 1.1 jtc break; 520 1.1 jtc case '[': 521 1.1 jtc p_bracket(p); 522 1.1 jtc break; 523 1.1 jtc case '\\': 524 1.39 christos (void)REQUIRE(MORE(), REG_EESCAPE); 525 1.39 christos wc = WGETNEXT(); 526 1.39 christos #ifdef REGEX_GNU_EXTENSIONS 527 1.39 christos if (p->gnuext) { 528 1.39 christos handled = 1; 529 1.39 christos switch (wc) { 530 1.39 christos case '`': 531 1.39 christos EMIT(OBOS, 0); 532 1.39 christos break; 533 1.39 christos case '\'': 534 1.39 christos EMIT(OEOS, 0); 535 1.39 christos break; 536 1.39 christos case 'B': 537 1.39 christos EMIT(ONWBND, 0); 538 1.39 christos break; 539 1.39 christos case 'b': 540 1.39 christos EMIT(OWBND, 0); 541 1.39 christos break; 542 1.39 christos case 'W': 543 1.39 christos case 'w': 544 1.39 christos case 'S': 545 1.39 christos case 's': 546 1.39 christos p_b_pseudoclass(p, wc); 547 1.39 christos break; 548 1.46 christos case 'a': 549 1.46 christos ordinary(p, '\a'); 550 1.46 christos break; 551 1.46 christos case 'e': 552 1.46 christos ordinary(p, '\e'); 553 1.46 christos break; 554 1.46 christos case 'f': 555 1.46 christos ordinary(p, '\f'); 556 1.46 christos break; 557 1.46 christos case 'n': 558 1.46 christos ordinary(p, '\n'); 559 1.46 christos break; 560 1.46 christos case 'r': 561 1.46 christos ordinary(p, '\r'); 562 1.46 christos break; 563 1.46 christos case 't': 564 1.46 christos ordinary(p, '\t'); 565 1.46 christos break; 566 1.46 christos case 'v': 567 1.46 christos ordinary(p, '\v'); 568 1.46 christos break; 569 1.39 christos case '1': 570 1.39 christos case '2': 571 1.39 christos case '3': 572 1.39 christos case '4': 573 1.39 christos case '5': 574 1.39 christos case '6': 575 1.39 christos case '7': 576 1.39 christos case '8': 577 1.39 christos case '9': 578 1.39 christos i = wc - '0'; 579 1.39 christos assert(i < NPAREN); 580 1.39 christos if (p->pend[i] != 0) { 581 1.39 christos assert(i <= p->g->nsub); 582 1.39 christos EMIT(OBACK_, i); 583 1.39 christos assert(p->pbegin[i] != 0); 584 1.39 christos assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 585 1.39 christos assert(OP(p->strip[p->pend[i]]) == ORPAREN); 586 1.39 christos (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 587 1.39 christos EMIT(O_BACK, i); 588 1.39 christos } else 589 1.39 christos SETERROR(REG_ESUBREG); 590 1.39 christos p->g->backrefs = 1; 591 1.39 christos break; 592 1.39 christos default: 593 1.39 christos handled = 0; 594 1.39 christos } 595 1.39 christos /* Don't proceed to the POSIX bits if we've already handled it */ 596 1.39 christos if (handled) 597 1.39 christos break; 598 1.39 christos } 599 1.39 christos #endif 600 1.39 christos switch (wc) { 601 1.39 christos case '<': 602 1.39 christos EMIT(OBOW, 0); 603 1.39 christos break; 604 1.39 christos case '>': 605 1.39 christos EMIT(OEOW, 0); 606 1.39 christos break; 607 1.39 christos default: 608 1.39 christos if (may_escape(p, wc)) 609 1.39 christos ordinary(p, wc); 610 1.39 christos else 611 1.39 christos SETERROR(REG_EESCAPE); 612 1.39 christos break; 613 1.39 christos } 614 1.1 jtc break; 615 1.1 jtc default: 616 1.38 christos if (p->error != 0) 617 1.39 christos return (false); 618 1.39 christos p->next--; 619 1.39 christos wc = WGETNEXT(); 620 1.39 christos ordinary(p, wc); 621 1.1 jtc break; 622 1.1 jtc } 623 1.1 jtc 624 1.1 jtc if (!MORE()) 625 1.39 christos return (false); 626 1.1 jtc c = PEEK(); 627 1.1 jtc /* we call { a repetition if followed by a digit */ 628 1.39 christos if (!( c == '*' || c == '+' || c == '?' || c == '{')) 629 1.39 christos return (false); /* no repetition, we're done */ 630 1.39 christos else if (c == '{') 631 1.39 christos (void)REQUIRE(MORE2() && \ 632 1.39 christos (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT); 633 1.1 jtc NEXT(); 634 1.1 jtc 635 1.39 christos (void)REQUIRE(!wascaret, REG_BADRPT); 636 1.1 jtc switch (c) { 637 1.1 jtc case '*': /* implemented as +? */ 638 1.4 jtc /* this case does not require the (y|) trick, noKLUDGE */ 639 1.1 jtc INSERT(OPLUS_, pos); 640 1.1 jtc ASTERN(O_PLUS, pos); 641 1.1 jtc INSERT(OQUEST_, pos); 642 1.1 jtc ASTERN(O_QUEST, pos); 643 1.1 jtc break; 644 1.1 jtc case '+': 645 1.1 jtc INSERT(OPLUS_, pos); 646 1.1 jtc ASTERN(O_PLUS, pos); 647 1.1 jtc break; 648 1.1 jtc case '?': 649 1.4 jtc /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 650 1.4 jtc INSERT(OCH_, pos); /* offset slightly wrong */ 651 1.4 jtc ASTERN(OOR1, pos); /* this one's right */ 652 1.4 jtc AHEAD(pos); /* fix the OCH_ */ 653 1.4 jtc EMIT(OOR2, 0); /* offset very wrong... */ 654 1.4 jtc AHEAD(THERE()); /* ...so fix it */ 655 1.4 jtc ASTERN(O_CH, THERETHERE()); 656 1.1 jtc break; 657 1.1 jtc case '{': 658 1.1 jtc count = p_count(p); 659 1.1 jtc if (EAT(',')) { 660 1.39 christos if (isdigit((uch)PEEK())) { 661 1.1 jtc count2 = p_count(p); 662 1.39 christos (void)REQUIRE(count <= count2, REG_BADBR); 663 1.1 jtc } else /* single number with comma */ 664 1.1 jtc count2 = INFINITY; 665 1.1 jtc } else /* just a single number */ 666 1.1 jtc count2 = count; 667 1.39 christos repeat(p, pos, count, count2); 668 1.1 jtc if (!EAT('}')) { /* error heuristics */ 669 1.1 jtc while (MORE() && PEEK() != '}') 670 1.1 jtc NEXT(); 671 1.39 christos (void)REQUIRE(MORE(), REG_EBRACE); 672 1.1 jtc SETERROR(REG_BADBR); 673 1.1 jtc } 674 1.1 jtc break; 675 1.1 jtc } 676 1.1 jtc 677 1.1 jtc if (!MORE()) 678 1.39 christos return (false); 679 1.1 jtc c = PEEK(); 680 1.1 jtc if (!( c == '*' || c == '+' || c == '?' || 681 1.39 christos (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 682 1.39 christos return (false); 683 1.1 jtc SETERROR(REG_BADRPT); 684 1.39 christos return (false); 685 1.1 jtc } 686 1.1 jtc 687 1.1 jtc /* 688 1.1 jtc - p_str - string (no metacharacters) "parser" 689 1.9 perry == static void p_str(struct parse *p); 690 1.1 jtc */ 691 1.1 jtc static void 692 1.39 christos p_str(struct parse *p) 693 1.1 jtc { 694 1.39 christos (void)REQUIRE(MORE(), REG_EMPTY); 695 1.39 christos while (MORE()) 696 1.39 christos ordinary(p, WGETNEXT()); 697 1.39 christos } 698 1.14 lukem 699 1.39 christos /* 700 1.39 christos * Eat consecutive branch delimiters for the kind of expression that we are 701 1.39 christos * parsing, return the number of delimiters that we ate. 702 1.39 christos */ 703 1.39 christos static int 704 1.39 christos p_branch_eat_delim(struct parse *p, struct branchc *bc) 705 1.39 christos { 706 1.39 christos int nskip; 707 1.14 lukem 708 1.39 christos (void)bc; 709 1.39 christos nskip = 0; 710 1.39 christos while (EATSPEC('|')) 711 1.39 christos ++nskip; 712 1.39 christos return (nskip); 713 1.1 jtc } 714 1.1 jtc 715 1.1 jtc /* 716 1.39 christos * Insert necessary branch book-keeping operations. This emits a 717 1.39 christos * bogus 'next' offset, since we still have more to parse 718 1.1 jtc */ 719 1.1 jtc static void 720 1.39 christos p_branch_ins_offset(struct parse *p, struct branchc *bc) 721 1.9 perry { 722 1.1 jtc 723 1.39 christos if (bc->nbranch == 0) { 724 1.39 christos INSERT(OCH_, bc->start); /* offset is wrong */ 725 1.39 christos bc->fwd = bc->start; 726 1.39 christos bc->back = bc->start; 727 1.39 christos } 728 1.39 christos 729 1.39 christos ASTERN(OOR1, bc->back); 730 1.39 christos bc->back = THERE(); 731 1.39 christos AHEAD(bc->fwd); /* fix previous offset */ 732 1.39 christos bc->fwd = HERE(); 733 1.39 christos EMIT(OOR2, 0); /* offset is very wrong */ 734 1.39 christos ++bc->nbranch; 735 1.39 christos } 736 1.39 christos 737 1.39 christos /* 738 1.39 christos * Fix the offset of the tail branch, if we actually had any branches. 739 1.39 christos * This is to correct the bogus placeholder offset that we use. 740 1.39 christos */ 741 1.39 christos static void 742 1.39 christos p_branch_fix_tail(struct parse *p, struct branchc *bc) 743 1.39 christos { 744 1.14 lukem 745 1.39 christos /* Fix bogus offset at the tail if we actually have branches */ 746 1.39 christos if (bc->nbranch > 0) { 747 1.39 christos AHEAD(bc->fwd); 748 1.39 christos ASTERN(O_CH, bc->back); 749 1.30 christos } 750 1.39 christos } 751 1.39 christos 752 1.39 christos /* 753 1.39 christos * Signal to the parser that an empty branch has been encountered; this will, 754 1.39 christos * in the future, be used to allow for more permissive behavior with empty 755 1.39 christos * branches. The return value should indicate whether parsing may continue 756 1.39 christos * or not. 757 1.39 christos */ 758 1.39 christos static bool 759 1.39 christos p_branch_empty(struct parse *p, struct branchc *bc) 760 1.39 christos { 761 1.39 christos 762 1.39 christos (void)bc; 763 1.39 christos SETERROR(REG_EMPTY); 764 1.39 christos return (false); 765 1.39 christos } 766 1.39 christos 767 1.39 christos /* 768 1.39 christos * Take care of any branching requirements. This includes inserting the 769 1.39 christos * appropriate branching instructions as well as eating all of the branch 770 1.39 christos * delimiters until we either run out of pattern or need to parse more pattern. 771 1.39 christos */ 772 1.39 christos static bool 773 1.39 christos p_branch_do(struct parse *p, struct branchc *bc) 774 1.39 christos { 775 1.39 christos int ate = 0; 776 1.39 christos 777 1.39 christos ate = p_branch_eat_delim(p, bc); 778 1.39 christos if (ate == 0) 779 1.39 christos return (false); 780 1.39 christos else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc)) 781 1.39 christos /* 782 1.39 christos * Halt parsing only if we have an empty branch and p_branch_empty 783 1.39 christos * indicates that we must not continue. In the future, this will not 784 1.39 christos * necessarily be an error. 785 1.39 christos */ 786 1.39 christos return (false); 787 1.39 christos p_branch_ins_offset(p, bc); 788 1.39 christos 789 1.39 christos return (true); 790 1.39 christos } 791 1.30 christos 792 1.39 christos static void 793 1.39 christos p_bre_pre_parse(struct parse *p, struct branchc *bc) 794 1.39 christos { 795 1.14 lukem 796 1.40 christos (void)bc; 797 1.39 christos /* 798 1.39 christos * Does not move cleanly into expression parser because of 799 1.39 christos * ordinary interpration of * at the beginning position of 800 1.39 christos * an expression. 801 1.39 christos */ 802 1.1 jtc if (EAT('^')) { 803 1.1 jtc EMIT(OBOL, 0); 804 1.1 jtc p->g->iflags |= USEBOL; 805 1.1 jtc p->g->nbol++; 806 1.1 jtc } 807 1.39 christos } 808 1.39 christos 809 1.39 christos static void 810 1.39 christos p_bre_post_parse(struct parse *p, struct branchc *bc) 811 1.39 christos { 812 1.39 christos 813 1.39 christos /* Expression is terminating due to EOL token */ 814 1.39 christos if (bc->terminate) { 815 1.1 jtc DROP(1); 816 1.1 jtc EMIT(OEOL, 0); 817 1.1 jtc p->g->iflags |= USEEOL; 818 1.1 jtc p->g->neol++; 819 1.1 jtc } 820 1.39 christos } 821 1.1 jtc 822 1.39 christos /* 823 1.39 christos - p_re - Top level parser, concatenation and BRE anchoring 824 1.39 christos == static void p_re(struct parse *p, int end1, int end2); 825 1.39 christos * Giving end1 as OUT essentially eliminates the end1/end2 check. 826 1.39 christos * 827 1.39 christos * This implementation is a bit of a kludge, in that a trailing $ is first 828 1.39 christos * taken as an ordinary character and then revised to be an anchor. 829 1.39 christos * The amount of lookahead needed to avoid this kludge is excessive. 830 1.39 christos */ 831 1.39 christos static void 832 1.39 christos p_re(struct parse *p, 833 1.39 christos int end1, /* first terminating character */ 834 1.39 christos int end2) /* second terminating character; ignored for EREs */ 835 1.39 christos { 836 1.39 christos struct branchc bc; 837 1.39 christos 838 1.39 christos bc.nbranch = 0; 839 1.39 christos if (end1 == OUT && end2 == OUT) 840 1.39 christos bc.outer = true; 841 1.39 christos else 842 1.39 christos bc.outer = false; 843 1.39 christos #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2)) 844 1.39 christos for (;;) { 845 1.39 christos bc.start = HERE(); 846 1.39 christos bc.nchain = 0; 847 1.39 christos bc.terminate = false; 848 1.39 christos if (p->pre_parse != NULL) 849 1.39 christos p->pre_parse(p, &bc); 850 1.39 christos while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) { 851 1.39 christos bc.terminate = p->parse_expr(p, &bc); 852 1.39 christos ++bc.nchain; 853 1.39 christos } 854 1.39 christos if (p->post_parse != NULL) 855 1.39 christos p->post_parse(p, &bc); 856 1.39 christos (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY); 857 1.39 christos #ifdef REGEX_GNU_EXTENSIONS 858 1.39 christos if (p->gnuext && HERE() == bc.start && !p_branch_empty(p, &bc)) 859 1.39 christos break; 860 1.39 christos #endif 861 1.39 christos if (!p->allowbranch) 862 1.39 christos break; 863 1.39 christos /* 864 1.39 christos * p_branch_do's return value indicates whether we should 865 1.39 christos * continue parsing or not. This is both for correctness and 866 1.39 christos * a slight optimization, because it will check if we've 867 1.39 christos * encountered an empty branch or the end of the string 868 1.39 christos * immediately following a branch delimiter. 869 1.39 christos */ 870 1.39 christos if (!p_branch_do(p, &bc)) 871 1.39 christos break; 872 1.39 christos } 873 1.39 christos #undef SEE_END 874 1.39 christos if (p->allowbranch) 875 1.39 christos p_branch_fix_tail(p, &bc); 876 1.39 christos assert(!MORE() || SEE(end1)); 877 1.1 jtc } 878 1.1 jtc 879 1.1 jtc /* 880 1.1 jtc - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 881 1.39 christos == static bool p_simp_re(struct parse *p, struct branchc *bc); 882 1.1 jtc */ 883 1.39 christos static bool /* was the simple RE an unbackslashed $? */ 884 1.39 christos p_simp_re(struct parse *p, struct branchc *bc) 885 1.1 jtc { 886 1.9 perry int c; 887 1.39 christos int cc; /* convenient/control character */ 888 1.9 perry int count; 889 1.9 perry int count2; 890 1.39 christos sopno pos; 891 1.39 christos bool handled; 892 1.40 christos size_t i; 893 1.39 christos wint_t wc; 894 1.9 perry sopno subno; 895 1.1 jtc # define BACKSL (1<<CHAR_BIT) 896 1.1 jtc 897 1.39 christos pos = HERE(); /* repetition op, if any, covers from here */ 898 1.39 christos handled = false; 899 1.1 jtc 900 1.1 jtc assert(MORE()); /* caller should have ensured this */ 901 1.48 christos c = (uch)GETNEXT(); 902 1.1 jtc if (c == '\\') { 903 1.39 christos (void)REQUIRE(MORE(), REG_EESCAPE); 904 1.48 christos cc = (uch)GETNEXT(); 905 1.39 christos c = BACKSL | cc; 906 1.39 christos #ifdef REGEX_GNU_EXTENSIONS 907 1.39 christos if (p->gnuext) { 908 1.39 christos handled = true; 909 1.39 christos switch (c) { 910 1.39 christos case BACKSL|'`': 911 1.39 christos EMIT(OBOS, 0); 912 1.39 christos break; 913 1.39 christos case BACKSL|'\'': 914 1.39 christos EMIT(OEOS, 0); 915 1.39 christos break; 916 1.39 christos case BACKSL|'B': 917 1.39 christos EMIT(ONWBND, 0); 918 1.39 christos break; 919 1.39 christos case BACKSL|'b': 920 1.39 christos EMIT(OWBND, 0); 921 1.39 christos break; 922 1.39 christos case BACKSL|'W': 923 1.39 christos case BACKSL|'w': 924 1.39 christos case BACKSL|'S': 925 1.39 christos case BACKSL|'s': 926 1.39 christos p_b_pseudoclass(p, cc); 927 1.39 christos break; 928 1.46 christos case BACKSL|'a': 929 1.46 christos ordinary(p, '\a'); 930 1.46 christos break; 931 1.46 christos case BACKSL|'e': 932 1.46 christos ordinary(p, '\e'); 933 1.46 christos break; 934 1.46 christos case BACKSL|'f': 935 1.46 christos ordinary(p, '\f'); 936 1.46 christos break; 937 1.46 christos case BACKSL|'n': 938 1.46 christos ordinary(p, '\n'); 939 1.46 christos break; 940 1.46 christos case BACKSL|'r': 941 1.46 christos ordinary(p, '\r'); 942 1.46 christos break; 943 1.46 christos case BACKSL|'t': 944 1.46 christos ordinary(p, '\t'); 945 1.46 christos break; 946 1.46 christos case BACKSL|'v': 947 1.46 christos ordinary(p, '\v'); 948 1.46 christos break; 949 1.39 christos default: 950 1.39 christos handled = false; 951 1.39 christos } 952 1.39 christos } 953 1.39 christos #endif 954 1.1 jtc } 955 1.39 christos if (!handled) { 956 1.39 christos switch (c) { 957 1.39 christos case '.': 958 1.39 christos if (p->g->cflags®_NEWLINE) 959 1.39 christos nonnewline(p); 960 1.39 christos else 961 1.39 christos EMIT(OANY, 0); 962 1.39 christos break; 963 1.39 christos case '[': 964 1.39 christos p_bracket(p); 965 1.39 christos break; 966 1.39 christos case BACKSL|'<': 967 1.39 christos EMIT(OBOW, 0); 968 1.39 christos break; 969 1.39 christos case BACKSL|'>': 970 1.39 christos EMIT(OEOW, 0); 971 1.39 christos break; 972 1.39 christos case BACKSL|'{': 973 1.39 christos SETERROR(REG_BADRPT); 974 1.39 christos break; 975 1.39 christos case BACKSL|'(': 976 1.39 christos p->g->nsub++; 977 1.40 christos subno = (sopno)p->g->nsub; 978 1.39 christos if (subno < NPAREN) 979 1.39 christos p->pbegin[subno] = HERE(); 980 1.39 christos EMIT(OLPAREN, subno); 981 1.39 christos /* the MORE here is an error heuristic */ 982 1.39 christos if (MORE() && !SEETWO('\\', ')')) 983 1.39 christos p_re(p, '\\', ')'); 984 1.39 christos if (subno < NPAREN) { 985 1.39 christos p->pend[subno] = HERE(); 986 1.39 christos assert(p->pend[subno] != 0); 987 1.39 christos } 988 1.39 christos EMIT(ORPAREN, subno); 989 1.39 christos (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 990 1.39 christos break; 991 1.39 christos case BACKSL|')': /* should not get here -- must be user */ 992 1.39 christos SETERROR(REG_EPAREN); 993 1.39 christos break; 994 1.39 christos case BACKSL|'1': 995 1.39 christos case BACKSL|'2': 996 1.39 christos case BACKSL|'3': 997 1.39 christos case BACKSL|'4': 998 1.39 christos case BACKSL|'5': 999 1.39 christos case BACKSL|'6': 1000 1.39 christos case BACKSL|'7': 1001 1.39 christos case BACKSL|'8': 1002 1.39 christos case BACKSL|'9': 1003 1.39 christos i = (c&~BACKSL) - '0'; 1004 1.39 christos assert(i < NPAREN); 1005 1.39 christos if (p->pend[i] != 0) { 1006 1.39 christos assert(i <= p->g->nsub); 1007 1.39 christos EMIT(OBACK_, i); 1008 1.39 christos assert(p->pbegin[i] != 0); 1009 1.39 christos assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 1010 1.39 christos assert(OP(p->strip[p->pend[i]]) == ORPAREN); 1011 1.39 christos (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 1012 1.39 christos EMIT(O_BACK, i); 1013 1.39 christos } else 1014 1.39 christos SETERROR(REG_ESUBREG); 1015 1.39 christos p->g->backrefs = 1; 1016 1.39 christos break; 1017 1.39 christos case '*': 1018 1.39 christos /* 1019 1.39 christos * Ordinary if used as the first character beyond BOL anchor of 1020 1.39 christos * a (sub-)expression, counts as a bad repetition operator if it 1021 1.39 christos * appears otherwise. 1022 1.39 christos */ 1023 1.39 christos (void)REQUIRE(bc->nchain == 0, REG_BADRPT); 1024 1.39 christos /* FALLTHROUGH */ 1025 1.39 christos default: 1026 1.39 christos if (p->error != 0) 1027 1.39 christos return (false); /* Definitely not $... */ 1028 1.39 christos p->next--; 1029 1.39 christos wc = WGETNEXT(); 1030 1.39 christos if ((c & BACKSL) == 0 || may_escape(p, wc)) 1031 1.39 christos ordinary(p, wc); 1032 1.39 christos else 1033 1.39 christos SETERROR(REG_EESCAPE); 1034 1.39 christos break; 1035 1.1 jtc } 1036 1.1 jtc } 1037 1.1 jtc 1038 1.1 jtc if (EAT('*')) { /* implemented as +? */ 1039 1.4 jtc /* this case does not require the (y|) trick, noKLUDGE */ 1040 1.1 jtc INSERT(OPLUS_, pos); 1041 1.1 jtc ASTERN(O_PLUS, pos); 1042 1.1 jtc INSERT(OQUEST_, pos); 1043 1.1 jtc ASTERN(O_QUEST, pos); 1044 1.39 christos #ifdef REGEX_GNU_EXTENSIONS 1045 1.39 christos } else if (p->gnuext && EATTWO('\\', '?')) { 1046 1.39 christos INSERT(OQUEST_, pos); 1047 1.39 christos ASTERN(O_QUEST, pos); 1048 1.39 christos } else if (p->gnuext && EATTWO('\\', '+')) { 1049 1.39 christos INSERT(OPLUS_, pos); 1050 1.39 christos ASTERN(O_PLUS, pos); 1051 1.39 christos #endif 1052 1.1 jtc } else if (EATTWO('\\', '{')) { 1053 1.1 jtc count = p_count(p); 1054 1.1 jtc if (EAT(',')) { 1055 1.39 christos if (MORE() && isdigit((uch)PEEK())) { 1056 1.1 jtc count2 = p_count(p); 1057 1.39 christos (void)REQUIRE(count <= count2, REG_BADBR); 1058 1.1 jtc } else /* single number with comma */ 1059 1.1 jtc count2 = INFINITY; 1060 1.1 jtc } else /* just a single number */ 1061 1.1 jtc count2 = count; 1062 1.39 christos repeat(p, pos, count, count2); 1063 1.1 jtc if (!EATTWO('\\', '}')) { /* error heuristics */ 1064 1.1 jtc while (MORE() && !SEETWO('\\', '}')) 1065 1.1 jtc NEXT(); 1066 1.39 christos (void)REQUIRE(MORE(), REG_EBRACE); 1067 1.1 jtc SETERROR(REG_BADBR); 1068 1.1 jtc } 1069 1.39 christos } else if (c == '$') /* $ (but not \$) ends it */ 1070 1.39 christos return (true); 1071 1.1 jtc 1072 1.39 christos return (false); 1073 1.1 jtc } 1074 1.1 jtc 1075 1.1 jtc /* 1076 1.1 jtc - p_count - parse a repetition count 1077 1.9 perry == static int p_count(struct parse *p); 1078 1.1 jtc */ 1079 1.1 jtc static int /* the value */ 1080 1.39 christos p_count(struct parse *p) 1081 1.1 jtc { 1082 1.9 perry int count = 0; 1083 1.9 perry int ndigits = 0; 1084 1.1 jtc 1085 1.39 christos while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 1086 1.48 christos count = count*10 + ((uch)GETNEXT() - '0'); 1087 1.1 jtc ndigits++; 1088 1.1 jtc } 1089 1.1 jtc 1090 1.39 christos (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 1091 1.1 jtc return(count); 1092 1.1 jtc } 1093 1.1 jtc 1094 1.1 jtc /* 1095 1.1 jtc - p_bracket - parse a bracketed character list 1096 1.9 perry == static void p_bracket(struct parse *p); 1097 1.1 jtc */ 1098 1.1 jtc static void 1099 1.39 christos p_bracket(struct parse *p) 1100 1.1 jtc { 1101 1.14 lukem cset *cs; 1102 1.39 christos wint_t ch; 1103 1.14 lukem 1104 1.1 jtc /* Dept of Truly Sickening Special-Case Kludges */ 1105 1.39 christos if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 1106 1.1 jtc EMIT(OBOW, 0); 1107 1.1 jtc NEXTn(6); 1108 1.1 jtc return; 1109 1.1 jtc } 1110 1.39 christos if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 1111 1.1 jtc EMIT(OEOW, 0); 1112 1.1 jtc NEXTn(6); 1113 1.1 jtc return; 1114 1.1 jtc } 1115 1.1 jtc 1116 1.39 christos if ((cs = allocset(p)) == NULL) 1117 1.39 christos return; 1118 1.39 christos 1119 1.39 christos if (p->g->cflags®_ICASE) 1120 1.39 christos cs->icase = 1; 1121 1.1 jtc if (EAT('^')) 1122 1.39 christos cs->invert = 1; 1123 1.1 jtc if (EAT(']')) 1124 1.39 christos CHadd(p, cs, ']'); 1125 1.1 jtc else if (EAT('-')) 1126 1.39 christos CHadd(p, cs, '-'); 1127 1.1 jtc while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 1128 1.1 jtc p_b_term(p, cs); 1129 1.1 jtc if (EAT('-')) 1130 1.39 christos CHadd(p, cs, '-'); 1131 1.39 christos (void)MUSTEAT(']', REG_EBRACK); 1132 1.1 jtc 1133 1.1 jtc if (p->error != 0) /* don't mess things up further */ 1134 1.1 jtc return; 1135 1.1 jtc 1136 1.39 christos if (cs->invert && p->g->cflags®_NEWLINE) 1137 1.39 christos cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 1138 1.1 jtc 1139 1.39 christos if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 1140 1.39 christos ordinary(p, ch); 1141 1.39 christos freeset(p, cs); 1142 1.39 christos } else 1143 1.40 christos EMIT(OANYOF, (size_t)(cs - p->g->sets)); 1144 1.39 christos } 1145 1.1 jtc 1146 1.39 christos static int 1147 1.39 christos p_range_cmp(wchar_t c1, wchar_t c2) 1148 1.39 christos { 1149 1.39 christos #ifdef REGEX_LIBC_COLLATE 1150 1.39 christos return __wcollate_range_cmp(c1, c2); 1151 1.42 christos #elif defined(NLS) 1152 1.39 christos /* Copied from libc/collate __wcollate_range_cmp */ 1153 1.39 christos wchar_t s1[2], s2[2]; 1154 1.1 jtc 1155 1.39 christos s1[0] = c1; 1156 1.39 christos s1[1] = L'\0'; 1157 1.39 christos s2[0] = c2; 1158 1.39 christos s2[1] = L'\0'; 1159 1.42 christos return wcscoll(s1, s2); 1160 1.42 christos #else 1161 1.42 christos char s1[2], s2[2]; 1162 1.42 christos 1163 1.42 christos s1[0] = (char)c1; 1164 1.42 christos s1[1] = '\0'; 1165 1.42 christos s2[0] = (char)c2; 1166 1.42 christos s2[1] = '\0'; 1167 1.42 christos return strcoll(s1, s2); 1168 1.39 christos #endif 1169 1.1 jtc } 1170 1.1 jtc 1171 1.1 jtc /* 1172 1.1 jtc - p_b_term - parse one term of a bracketed character list 1173 1.9 perry == static void p_b_term(struct parse *p, cset *cs); 1174 1.1 jtc */ 1175 1.1 jtc static void 1176 1.39 christos p_b_term(struct parse *p, cset *cs) 1177 1.1 jtc { 1178 1.9 perry char c; 1179 1.39 christos wint_t start, finish; 1180 1.39 christos wint_t i; 1181 1.39 christos #ifdef REGEX_LIBC_COLLATE 1182 1.39 christos struct xlocale_collate *table = 1183 1.39 christos (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE]; 1184 1.39 christos #endif 1185 1.1 jtc 1186 1.14 lukem _DIAGASSERT(p != NULL); 1187 1.14 lukem _DIAGASSERT(cs != NULL); 1188 1.14 lukem 1189 1.1 jtc /* classify what we've got */ 1190 1.1 jtc switch ((MORE()) ? PEEK() : '\0') { 1191 1.1 jtc case '[': 1192 1.1 jtc c = (MORE2()) ? PEEK2() : '\0'; 1193 1.1 jtc break; 1194 1.1 jtc case '-': 1195 1.1 jtc SETERROR(REG_ERANGE); 1196 1.1 jtc return; /* NOTE RETURN */ 1197 1.1 jtc default: 1198 1.1 jtc c = '\0'; 1199 1.1 jtc break; 1200 1.1 jtc } 1201 1.1 jtc 1202 1.1 jtc switch (c) { 1203 1.1 jtc case ':': /* character class */ 1204 1.1 jtc NEXT2(); 1205 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK); 1206 1.1 jtc c = PEEK(); 1207 1.39 christos (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 1208 1.1 jtc p_b_cclass(p, cs); 1209 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK); 1210 1.39 christos (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 1211 1.1 jtc break; 1212 1.1 jtc case '=': /* equivalence class */ 1213 1.1 jtc NEXT2(); 1214 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK); 1215 1.1 jtc c = PEEK(); 1216 1.39 christos (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 1217 1.1 jtc p_b_eclass(p, cs); 1218 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK); 1219 1.39 christos (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 1220 1.1 jtc break; 1221 1.1 jtc default: /* symbol, ordinary character, or range */ 1222 1.1 jtc start = p_b_symbol(p); 1223 1.1 jtc if (SEE('-') && MORE2() && PEEK2() != ']') { 1224 1.1 jtc /* range */ 1225 1.1 jtc NEXT(); 1226 1.1 jtc if (EAT('-')) 1227 1.1 jtc finish = '-'; 1228 1.1 jtc else 1229 1.1 jtc finish = p_b_symbol(p); 1230 1.1 jtc } else 1231 1.1 jtc finish = start; 1232 1.39 christos if (start == finish) 1233 1.39 christos CHadd(p, cs, start); 1234 1.39 christos else { 1235 1.39 christos #ifdef REGEX_LIBC_COLLATE 1236 1.39 christos if (table->__collate_load_error || MB_CUR_MAX > 1) { 1237 1.39 christos #else 1238 1.39 christos if (MB_CUR_MAX > 1) { 1239 1.39 christos #endif 1240 1.39 christos (void)REQUIRE(start <= finish, REG_ERANGE); 1241 1.39 christos CHaddrange(p, cs, start, finish); 1242 1.39 christos } else { 1243 1.39 christos (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE); 1244 1.39 christos for (i = 0; i <= UCHAR_MAX; i++) { 1245 1.39 christos if (p_range_cmp(start, i) <= 0 && 1246 1.39 christos p_range_cmp(i, finish) <= 0 ) 1247 1.39 christos CHadd(p, cs, i); 1248 1.39 christos } 1249 1.39 christos } 1250 1.39 christos } 1251 1.39 christos break; 1252 1.39 christos } 1253 1.39 christos } 1254 1.39 christos 1255 1.39 christos #ifdef REGEX_GNU_EXTENSIONS 1256 1.39 christos /* 1257 1.39 christos - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S) 1258 1.39 christos == static int p_b_pseudoclass(struct parse *p, char c) 1259 1.39 christos */ 1260 1.39 christos static int 1261 1.39 christos p_b_pseudoclass(struct parse *p, char c) { 1262 1.39 christos cset *cs; 1263 1.39 christos 1264 1.39 christos if ((cs = allocset(p)) == NULL) 1265 1.39 christos return(0); 1266 1.39 christos 1267 1.39 christos if (p->g->cflags®_ICASE) 1268 1.39 christos cs->icase = 1; 1269 1.39 christos 1270 1.39 christos switch (c) { 1271 1.39 christos case 'W': 1272 1.39 christos cs->invert = 1; 1273 1.39 christos /* FALLTHROUGH */ 1274 1.39 christos case 'w': 1275 1.39 christos p_b_cclass_named(p, cs, "alnum"); 1276 1.1 jtc break; 1277 1.39 christos case 'S': 1278 1.39 christos cs->invert = 1; 1279 1.39 christos /* FALLTHROUGH */ 1280 1.39 christos case 's': 1281 1.39 christos p_b_cclass_named(p, cs, "space"); 1282 1.39 christos break; 1283 1.39 christos default: 1284 1.39 christos return(0); 1285 1.1 jtc } 1286 1.39 christos 1287 1.40 christos EMIT(OANYOF, (size_t)(cs - p->g->sets)); 1288 1.39 christos return(1); 1289 1.1 jtc } 1290 1.39 christos #endif 1291 1.1 jtc 1292 1.1 jtc /* 1293 1.1 jtc - p_b_cclass - parse a character-class name and deal with it 1294 1.9 perry == static void p_b_cclass(struct parse *p, cset *cs); 1295 1.1 jtc */ 1296 1.1 jtc static void 1297 1.39 christos p_b_cclass(struct parse *p, cset *cs) 1298 1.1 jtc { 1299 1.39 christos const char *sp = p->next; 1300 1.9 perry size_t len; 1301 1.39 christos char clname[16]; 1302 1.1 jtc 1303 1.39 christos while (MORE() && isalpha((uch)PEEK())) 1304 1.1 jtc NEXT(); 1305 1.1 jtc len = p->next - sp; 1306 1.39 christos if (len >= sizeof(clname) - 1) { 1307 1.1 jtc SETERROR(REG_ECTYPE); 1308 1.1 jtc return; 1309 1.1 jtc } 1310 1.39 christos memcpy(clname, sp, len); 1311 1.39 christos clname[len] = '\0'; 1312 1.39 christos 1313 1.39 christos p_b_cclass_named(p, cs, clname); 1314 1.39 christos } 1315 1.42 christos 1316 1.39 christos /* 1317 1.39 christos - p_b_cclass_named - deal with a named character class 1318 1.39 christos == static void p_b_cclass_named(struct parse *p, cset *cs, const char []); 1319 1.39 christos */ 1320 1.39 christos static void 1321 1.39 christos p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) { 1322 1.39 christos wctype_t wct; 1323 1.1 jtc 1324 1.39 christos if ((wct = wctype(clname)) == 0) { 1325 1.39 christos SETERROR(REG_ECTYPE); 1326 1.39 christos return; 1327 1.39 christos } 1328 1.39 christos CHaddtype(p, cs, wct); 1329 1.1 jtc } 1330 1.1 jtc 1331 1.1 jtc /* 1332 1.1 jtc - p_b_eclass - parse an equivalence-class name and deal with it 1333 1.9 perry == static void p_b_eclass(struct parse *p, cset *cs); 1334 1.1 jtc * 1335 1.1 jtc * This implementation is incomplete. xxx 1336 1.1 jtc */ 1337 1.1 jtc static void 1338 1.39 christos p_b_eclass(struct parse *p, cset *cs) 1339 1.1 jtc { 1340 1.39 christos wint_t c; 1341 1.1 jtc 1342 1.14 lukem _DIAGASSERT(p != NULL); 1343 1.14 lukem _DIAGASSERT(cs != NULL); 1344 1.14 lukem 1345 1.1 jtc c = p_b_coll_elem(p, '='); 1346 1.39 christos CHadd(p, cs, c); 1347 1.1 jtc } 1348 1.1 jtc 1349 1.1 jtc /* 1350 1.1 jtc - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 1351 1.39 christos == static wint_t p_b_symbol(struct parse *p); 1352 1.1 jtc */ 1353 1.39 christos static wint_t /* value of symbol */ 1354 1.39 christos p_b_symbol(struct parse *p) 1355 1.1 jtc { 1356 1.39 christos wint_t value; 1357 1.1 jtc 1358 1.14 lukem _DIAGASSERT(p != NULL); 1359 1.14 lukem 1360 1.39 christos (void)REQUIRE(MORE(), REG_EBRACK); 1361 1.1 jtc if (!EATTWO('[', '.')) 1362 1.39 christos return(WGETNEXT()); 1363 1.1 jtc 1364 1.1 jtc /* collating symbol */ 1365 1.1 jtc value = p_b_coll_elem(p, '.'); 1366 1.39 christos (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 1367 1.1 jtc return(value); 1368 1.1 jtc } 1369 1.1 jtc 1370 1.1 jtc /* 1371 1.1 jtc - p_b_coll_elem - parse a collating-element name and look it up 1372 1.39 christos == static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 1373 1.1 jtc */ 1374 1.39 christos static wint_t /* value of collating element */ 1375 1.39 christos p_b_coll_elem(struct parse *p, 1376 1.39 christos wint_t endc) /* name ended by endc,']' */ 1377 1.39 christos { 1378 1.39 christos const char *sp = p->next; 1379 1.39 christos struct cname *cp; 1380 1.42 christos size_t len; 1381 1.1 jtc 1382 1.14 lukem _DIAGASSERT(p != NULL); 1383 1.14 lukem 1384 1.1 jtc while (MORE() && !SEETWO(endc, ']')) 1385 1.1 jtc NEXT(); 1386 1.1 jtc if (!MORE()) { 1387 1.1 jtc SETERROR(REG_EBRACK); 1388 1.1 jtc return(0); 1389 1.1 jtc } 1390 1.1 jtc len = p->next - sp; 1391 1.1 jtc for (cp = cnames; cp->name != NULL; cp++) 1392 1.37 christos if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) 1393 1.1 jtc return(cp->code); /* known name */ 1394 1.42 christos #ifdef NLS 1395 1.42 christos mbstate_t mbs; 1396 1.42 christos wchar_t wc; 1397 1.42 christos size_t clen; 1398 1.42 christos 1399 1.39 christos memset(&mbs, 0, sizeof(mbs)); 1400 1.39 christos if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 1401 1.39 christos return (wc); /* single character */ 1402 1.39 christos else if (clen == (size_t)-1 || clen == (size_t)-2) 1403 1.39 christos SETERROR(REG_ILLSEQ); 1404 1.39 christos else 1405 1.39 christos SETERROR(REG_ECOLLATE); /* neither */ 1406 1.1 jtc return(0); 1407 1.42 christos #else 1408 1.42 christos if (len == 1) 1409 1.42 christos return *sp; /* single character */ 1410 1.42 christos SETERROR(REG_ECOLLATE); /* neither */ 1411 1.42 christos return 0; 1412 1.42 christos #endif 1413 1.1 jtc } 1414 1.1 jtc 1415 1.1 jtc /* 1416 1.39 christos - may_escape - determine whether 'ch' is escape-able in the current context 1417 1.39 christos == static int may_escape(struct parse *p, const wint_t ch) 1418 1.39 christos */ 1419 1.39 christos static bool 1420 1.39 christos may_escape(struct parse *p, const wint_t ch) 1421 1.39 christos { 1422 1.39 christos 1423 1.39 christos if ((p->pflags & PFLAG_LEGACY_ESC) != 0) 1424 1.39 christos return (true); 1425 1.48 christos if (iswalpha(ch) || ch == '\'' || ch == '`') 1426 1.39 christos return (false); 1427 1.39 christos return (true); 1428 1.39 christos #ifdef NOTYET 1429 1.39 christos /* 1430 1.39 christos * Build a whitelist of characters that may be escaped to produce an 1431 1.39 christos * ordinary in the current context. This assumes that these have not 1432 1.39 christos * been otherwise interpreted as a special character. Escaping an 1433 1.39 christos * ordinary character yields undefined results according to 1434 1.39 christos * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take 1435 1.39 christos * advantage of this and use escaped ordinary characters to provide 1436 1.39 christos * special meaning, e.g. \b, \B, \w, \W, \s, \S. 1437 1.39 christos */ 1438 1.39 christos switch(ch) { 1439 1.39 christos case '|': 1440 1.39 christos case '+': 1441 1.39 christos case '?': 1442 1.39 christos /* The above characters may not be escaped in BREs */ 1443 1.39 christos if (!(p->g->cflags®_EXTENDED)) 1444 1.39 christos return (false); 1445 1.39 christos /* Fallthrough */ 1446 1.39 christos case '(': 1447 1.39 christos case ')': 1448 1.39 christos case '{': 1449 1.39 christos case '}': 1450 1.39 christos case '.': 1451 1.39 christos case '[': 1452 1.39 christos case ']': 1453 1.39 christos case '\\': 1454 1.39 christos case '*': 1455 1.39 christos case '^': 1456 1.39 christos case '$': 1457 1.39 christos return (true); 1458 1.39 christos default: 1459 1.39 christos return (false); 1460 1.39 christos } 1461 1.39 christos #endif 1462 1.39 christos } 1463 1.39 christos 1464 1.39 christos /* 1465 1.1 jtc - othercase - return the case counterpart of an alphabetic 1466 1.39 christos == static wint_t othercase(wint_t ch); 1467 1.1 jtc */ 1468 1.39 christos static wint_t /* if no counterpart, return ch */ 1469 1.39 christos othercase(wint_t ch) 1470 1.39 christos { 1471 1.39 christos assert(iswalpha(ch)); 1472 1.39 christos if (iswupper(ch)) 1473 1.39 christos return(towlower(ch)); 1474 1.39 christos else if (iswlower(ch)) 1475 1.39 christos return(towupper(ch)); 1476 1.1 jtc else /* peculiar, but could happen */ 1477 1.1 jtc return(ch); 1478 1.1 jtc } 1479 1.1 jtc 1480 1.1 jtc /* 1481 1.1 jtc - bothcases - emit a dualcase version of a two-case character 1482 1.39 christos == static void bothcases(struct parse *p, wint_t ch); 1483 1.1 jtc * 1484 1.1 jtc * Boy, is this implementation ever a kludge... 1485 1.1 jtc */ 1486 1.1 jtc static void 1487 1.39 christos bothcases(struct parse *p, wint_t ch) 1488 1.1 jtc { 1489 1.39 christos const char *oldnext = p->next; 1490 1.39 christos const char *oldend = p->end; 1491 1.39 christos char bracket[3 + MB_LEN_MAX]; 1492 1.39 christos size_t n; 1493 1.1 jtc 1494 1.14 lukem _DIAGASSERT(p != NULL); 1495 1.14 lukem 1496 1.1 jtc assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1497 1.1 jtc p->next = bracket; 1498 1.42 christos #ifdef NLS 1499 1.42 christos mbstate_t mbs; 1500 1.39 christos memset(&mbs, 0, sizeof(mbs)); 1501 1.39 christos n = wcrtomb(bracket, ch, &mbs); 1502 1.39 christos assert(n != (size_t)-1); 1503 1.42 christos #else 1504 1.42 christos n = 0; 1505 1.42 christos bracket[n++] = ch; 1506 1.42 christos #endif 1507 1.39 christos bracket[n] = ']'; 1508 1.39 christos bracket[n + 1] = '\0'; 1509 1.39 christos p->end = bracket+n+1; 1510 1.1 jtc p_bracket(p); 1511 1.39 christos assert(p->next == p->end); 1512 1.1 jtc p->next = oldnext; 1513 1.1 jtc p->end = oldend; 1514 1.1 jtc } 1515 1.1 jtc 1516 1.1 jtc /* 1517 1.1 jtc - ordinary - emit an ordinary character 1518 1.39 christos == static void ordinary(struct parse *p, wint_t ch); 1519 1.1 jtc */ 1520 1.1 jtc static void 1521 1.39 christos ordinary(struct parse *p, wint_t ch) 1522 1.1 jtc { 1523 1.39 christos cset *cs; 1524 1.1 jtc 1525 1.14 lukem _DIAGASSERT(p != NULL); 1526 1.14 lukem 1527 1.39 christos if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 1528 1.39 christos bothcases(p, ch); 1529 1.40 christos else if ((wint_t)(ch & OPDMASK) == ch) 1530 1.40 christos EMIT(OCHAR, (size_t)ch); 1531 1.1 jtc else { 1532 1.39 christos /* 1533 1.39 christos * Kludge: character is too big to fit into an OCHAR operand. 1534 1.39 christos * Emit a singleton set. 1535 1.39 christos */ 1536 1.39 christos if ((cs = allocset(p)) == NULL) 1537 1.39 christos return; 1538 1.39 christos CHadd(p, cs, ch); 1539 1.40 christos EMIT(OANYOF, (size_t)(cs - p->g->sets)); 1540 1.1 jtc } 1541 1.1 jtc } 1542 1.1 jtc 1543 1.1 jtc /* 1544 1.1 jtc - nonnewline - emit REG_NEWLINE version of OANY 1545 1.9 perry == static void nonnewline(struct parse *p); 1546 1.1 jtc * 1547 1.1 jtc * Boy, is this implementation ever a kludge... 1548 1.1 jtc */ 1549 1.1 jtc static void 1550 1.39 christos nonnewline(struct parse *p) 1551 1.1 jtc { 1552 1.39 christos const char *oldnext = p->next; 1553 1.39 christos const char *oldend = p->end; 1554 1.1 jtc char bracket[4]; 1555 1.1 jtc 1556 1.14 lukem _DIAGASSERT(p != NULL); 1557 1.14 lukem 1558 1.1 jtc p->next = bracket; 1559 1.1 jtc p->end = bracket+3; 1560 1.1 jtc bracket[0] = '^'; 1561 1.1 jtc bracket[1] = '\n'; 1562 1.1 jtc bracket[2] = ']'; 1563 1.1 jtc bracket[3] = '\0'; 1564 1.1 jtc p_bracket(p); 1565 1.1 jtc assert(p->next == bracket+3); 1566 1.1 jtc p->next = oldnext; 1567 1.1 jtc p->end = oldend; 1568 1.1 jtc } 1569 1.1 jtc 1570 1.1 jtc /* 1571 1.1 jtc - repeat - generate code for a bounded repetition, recursively if needed 1572 1.39 christos == static void repeat(struct parse *p, sopno start, int from, int to); 1573 1.1 jtc */ 1574 1.1 jtc static void 1575 1.39 christos repeat(struct parse *p, 1576 1.39 christos sopno start, /* operand from here to end of strip */ 1577 1.39 christos int from, /* repeated from this number */ 1578 1.39 christos int to) /* to this number of times (maybe INFINITY) */ 1579 1.1 jtc { 1580 1.39 christos sopno finish = HERE(); 1581 1.1 jtc # define N 2 1582 1.1 jtc # define INF 3 1583 1.1 jtc # define REP(f, t) ((f)*8 + (t)) 1584 1.1 jtc # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1585 1.9 perry sopno copy; 1586 1.1 jtc 1587 1.14 lukem _DIAGASSERT(p != NULL); 1588 1.14 lukem 1589 1.39 christos if (p->error != 0) /* head off possible runaway recursion */ 1590 1.30 christos return; 1591 1.30 christos 1592 1.1 jtc assert(from <= to); 1593 1.1 jtc 1594 1.1 jtc switch (REP(MAP(from), MAP(to))) { 1595 1.1 jtc case REP(0, 0): /* must be user doing this */ 1596 1.1 jtc DROP(finish-start); /* drop the operand */ 1597 1.1 jtc break; 1598 1.1 jtc case REP(0, 1): /* as x{1,1}? */ 1599 1.1 jtc case REP(0, N): /* as x{1,n}? */ 1600 1.1 jtc case REP(0, INF): /* as x{1,}? */ 1601 1.4 jtc /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1602 1.4 jtc INSERT(OCH_, start); /* offset is wrong... */ 1603 1.39 christos repeat(p, start+1, 1, to); 1604 1.4 jtc ASTERN(OOR1, start); 1605 1.1 jtc AHEAD(start); /* ... fix it */ 1606 1.4 jtc EMIT(OOR2, 0); 1607 1.4 jtc AHEAD(THERE()); 1608 1.4 jtc ASTERN(O_CH, THERETHERE()); 1609 1.1 jtc break; 1610 1.1 jtc case REP(1, 1): /* trivial case */ 1611 1.1 jtc /* done */ 1612 1.1 jtc break; 1613 1.1 jtc case REP(1, N): /* as x?x{1,n-1} */ 1614 1.4 jtc /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1615 1.4 jtc INSERT(OCH_, start); 1616 1.4 jtc ASTERN(OOR1, start); 1617 1.4 jtc AHEAD(start); 1618 1.4 jtc EMIT(OOR2, 0); /* offset very wrong... */ 1619 1.4 jtc AHEAD(THERE()); /* ...so fix it */ 1620 1.4 jtc ASTERN(O_CH, THERETHERE()); 1621 1.1 jtc copy = dupl(p, start+1, finish+1); 1622 1.4 jtc assert(copy == finish+4); 1623 1.39 christos repeat(p, copy, 1, to-1); 1624 1.1 jtc break; 1625 1.1 jtc case REP(1, INF): /* as x+ */ 1626 1.1 jtc INSERT(OPLUS_, start); 1627 1.1 jtc ASTERN(O_PLUS, start); 1628 1.1 jtc break; 1629 1.1 jtc case REP(N, N): /* as xx{m-1,n-1} */ 1630 1.1 jtc copy = dupl(p, start, finish); 1631 1.39 christos repeat(p, copy, from-1, to-1); 1632 1.1 jtc break; 1633 1.1 jtc case REP(N, INF): /* as xx{n-1,INF} */ 1634 1.1 jtc copy = dupl(p, start, finish); 1635 1.39 christos repeat(p, copy, from-1, to); 1636 1.1 jtc break; 1637 1.1 jtc default: /* "can't happen" */ 1638 1.1 jtc SETERROR(REG_ASSERT); /* just in case */ 1639 1.1 jtc break; 1640 1.1 jtc } 1641 1.1 jtc } 1642 1.1 jtc 1643 1.1 jtc /* 1644 1.39 christos - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1645 1.39 christos - character from the parse struct, signals a REG_ILLSEQ error if the 1646 1.39 christos - character can't be converted. Returns the number of bytes consumed. 1647 1.39 christos */ 1648 1.39 christos static wint_t 1649 1.39 christos wgetnext(struct parse *p) 1650 1.39 christos { 1651 1.42 christos #ifdef NLS 1652 1.39 christos mbstate_t mbs; 1653 1.39 christos wchar_t wc; 1654 1.39 christos size_t n; 1655 1.39 christos 1656 1.39 christos memset(&mbs, 0, sizeof(mbs)); 1657 1.40 christos n = mbrtowc(&wc, p->next, (size_t)(p->end - p->next), &mbs); 1658 1.39 christos if (n == (size_t)-1 || n == (size_t)-2) { 1659 1.39 christos SETERROR(REG_ILLSEQ); 1660 1.39 christos return (0); 1661 1.39 christos } 1662 1.39 christos if (n == 0) 1663 1.39 christos n = 1; 1664 1.39 christos p->next += n; 1665 1.42 christos return wc; 1666 1.42 christos #else 1667 1.42 christos return *p->next++; 1668 1.42 christos #endif 1669 1.39 christos } 1670 1.39 christos 1671 1.39 christos /* 1672 1.1 jtc - seterr - set an error condition 1673 1.9 perry == static int seterr(struct parse *p, int e); 1674 1.1 jtc */ 1675 1.1 jtc static int /* useless but makes type checking happy */ 1676 1.39 christos seterr(struct parse *p, int e) 1677 1.1 jtc { 1678 1.14 lukem 1679 1.14 lukem _DIAGASSERT(p != NULL); 1680 1.14 lukem 1681 1.1 jtc if (p->error == 0) /* keep earliest error condition */ 1682 1.1 jtc p->error = e; 1683 1.1 jtc p->next = nuls; /* try to bring things to a halt */ 1684 1.1 jtc p->end = nuls; 1685 1.1 jtc return(0); /* make the return value well-defined */ 1686 1.1 jtc } 1687 1.1 jtc 1688 1.1 jtc /* 1689 1.1 jtc - allocset - allocate a set of characters for [] 1690 1.9 perry == static cset *allocset(struct parse *p); 1691 1.1 jtc */ 1692 1.1 jtc static cset * 1693 1.39 christos allocset(struct parse *p) 1694 1.1 jtc { 1695 1.39 christos cset *cs, *ncs; 1696 1.1 jtc 1697 1.14 lukem _DIAGASSERT(p != NULL); 1698 1.14 lukem 1699 1.39 christos ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs)); 1700 1.39 christos if (ncs == NULL) { 1701 1.39 christos SETERROR(REG_ESPACE); 1702 1.39 christos return (NULL); 1703 1.1 jtc } 1704 1.39 christos p->g->sets = ncs; 1705 1.39 christos cs = &p->g->sets[p->g->ncsets++]; 1706 1.39 christos memset(cs, 0, sizeof(*cs)); 1707 1.1 jtc 1708 1.1 jtc return(cs); 1709 1.1 jtc } 1710 1.1 jtc 1711 1.1 jtc /* 1712 1.1 jtc - freeset - free a now-unused set 1713 1.9 perry == static void freeset(struct parse *p, cset *cs); 1714 1.1 jtc */ 1715 1.1 jtc static void 1716 1.39 christos freeset(struct parse *p, cset *cs) 1717 1.1 jtc { 1718 1.14 lukem cset *top; 1719 1.14 lukem 1720 1.14 lukem _DIAGASSERT(p != NULL); 1721 1.14 lukem _DIAGASSERT(cs != NULL); 1722 1.14 lukem 1723 1.14 lukem top = &p->g->sets[p->g->ncsets]; 1724 1.1 jtc 1725 1.39 christos free(cs->wides); 1726 1.39 christos free(cs->ranges); 1727 1.39 christos free(cs->types); 1728 1.39 christos memset(cs, 0, sizeof(*cs)); 1729 1.1 jtc if (cs == top-1) /* recover only the easy case */ 1730 1.1 jtc p->g->ncsets--; 1731 1.1 jtc } 1732 1.1 jtc 1733 1.1 jtc /* 1734 1.39 christos - singleton - Determine whether a set contains only one character, 1735 1.39 christos - returning it if so, otherwise returning OUT. 1736 1.1 jtc */ 1737 1.39 christos static wint_t 1738 1.39 christos singleton(cset *cs) 1739 1.1 jtc { 1740 1.39 christos wint_t i, s, n; 1741 1.1 jtc 1742 1.39 christos for (i = n = 0; i < NC; i++) 1743 1.39 christos if (CHIN(cs, i)) { 1744 1.1 jtc n++; 1745 1.39 christos s = i; 1746 1.39 christos } 1747 1.39 christos if (n == 1) 1748 1.39 christos return (s); 1749 1.39 christos if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1750 1.39 christos cs->icase == 0) 1751 1.39 christos return (cs->wides[0]); 1752 1.39 christos /* Don't bother handling the other cases. */ 1753 1.39 christos return (OUT); 1754 1.1 jtc } 1755 1.1 jtc 1756 1.1 jtc /* 1757 1.39 christos - CHadd - add character to character set. 1758 1.1 jtc */ 1759 1.1 jtc static void 1760 1.39 christos CHadd(struct parse *p, cset *cs, wint_t ch) 1761 1.1 jtc { 1762 1.39 christos wint_t nch, *newwides; 1763 1.14 lukem 1764 1.14 lukem _DIAGASSERT(p != NULL); 1765 1.14 lukem _DIAGASSERT(cs != NULL); 1766 1.14 lukem 1767 1.49 christos if ((unsigned)ch < NC) 1768 1.40 christos cs->bmp[(unsigned)ch >> 3] |= 1 << (ch & 7); 1769 1.39 christos else { 1770 1.39 christos newwides = reallocarray(cs->wides, cs->nwides + 1, 1771 1.39 christos sizeof(*cs->wides)); 1772 1.39 christos if (newwides == NULL) { 1773 1.39 christos SETERROR(REG_ESPACE); 1774 1.39 christos return; 1775 1.39 christos } 1776 1.39 christos cs->wides = newwides; 1777 1.39 christos cs->wides[cs->nwides++] = ch; 1778 1.39 christos } 1779 1.39 christos if (cs->icase) { 1780 1.49 christos if ((unsigned)(nch = towlower(ch)) < NC) 1781 1.40 christos cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7); 1782 1.49 christos if ((unsigned)(nch = towupper(ch)) < NC) 1783 1.40 christos cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7); 1784 1.1 jtc } 1785 1.1 jtc } 1786 1.1 jtc 1787 1.1 jtc /* 1788 1.39 christos - CHaddrange - add all characters in the range [min,max] to a character set. 1789 1.1 jtc */ 1790 1.1 jtc static void 1791 1.39 christos CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 1792 1.1 jtc { 1793 1.39 christos crange *newranges; 1794 1.14 lukem 1795 1.39 christos _DIAGASSERT(p != NULL); 1796 1.14 lukem _DIAGASSERT(cs != NULL); 1797 1.14 lukem 1798 1.39 christos for (; min < NC && min <= max; min++) 1799 1.39 christos CHadd(p, cs, min); 1800 1.39 christos if (min >= max) 1801 1.39 christos return; 1802 1.39 christos newranges = reallocarray(cs->ranges, cs->nranges + 1, 1803 1.39 christos sizeof(*cs->ranges)); 1804 1.39 christos if (newranges == NULL) { 1805 1.39 christos SETERROR(REG_ESPACE); 1806 1.1 jtc return; 1807 1.1 jtc } 1808 1.39 christos cs->ranges = newranges; 1809 1.39 christos cs->ranges[cs->nranges].min = min; 1810 1.39 christos cs->ranges[cs->nranges].max = max; 1811 1.39 christos cs->nranges++; 1812 1.1 jtc } 1813 1.1 jtc 1814 1.1 jtc /* 1815 1.39 christos - CHaddtype - add all characters of a certain type to a character set. 1816 1.1 jtc */ 1817 1.1 jtc static void 1818 1.39 christos CHaddtype(struct parse *p, cset *cs, wctype_t wct) 1819 1.1 jtc { 1820 1.39 christos wint_t i; 1821 1.39 christos wctype_t *newtypes; 1822 1.14 lukem 1823 1.14 lukem _DIAGASSERT(p != NULL); 1824 1.14 lukem _DIAGASSERT(cs != NULL); 1825 1.14 lukem 1826 1.39 christos for (i = 0; i < NC; i++) 1827 1.39 christos if (iswctype(i, wct)) 1828 1.39 christos CHadd(p, cs, i); 1829 1.39 christos newtypes = reallocarray(cs->types, cs->ntypes + 1, 1830 1.39 christos sizeof(*cs->types)); 1831 1.39 christos if (newtypes == NULL) { 1832 1.39 christos SETERROR(REG_ESPACE); 1833 1.1 jtc return; 1834 1.39 christos } 1835 1.39 christos cs->types = newtypes; 1836 1.39 christos cs->types[cs->ntypes++] = wct; 1837 1.1 jtc } 1838 1.1 jtc 1839 1.1 jtc /* 1840 1.1 jtc - dupl - emit a duplicate of a bunch of sops 1841 1.9 perry == static sopno dupl(struct parse *p, sopno start, sopno finish); 1842 1.1 jtc */ 1843 1.1 jtc static sopno /* start of duplicate */ 1844 1.39 christos dupl(struct parse *p, 1845 1.39 christos sopno start, /* from here */ 1846 1.39 christos sopno finish) /* to this less one */ 1847 1.1 jtc { 1848 1.39 christos sopno ret = HERE(); 1849 1.9 perry sopno len = finish - start; 1850 1.1 jtc 1851 1.14 lukem _DIAGASSERT(p != NULL); 1852 1.14 lukem 1853 1.1 jtc assert(finish >= start); 1854 1.1 jtc if (len == 0) 1855 1.1 jtc return(ret); 1856 1.39 christos if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1857 1.39 christos return(ret); 1858 1.40 christos (void) memcpy(p->strip + p->slen, 1859 1.40 christos p->strip + start, len * sizeof(*p->strip)); 1860 1.1 jtc p->slen += len; 1861 1.1 jtc return(ret); 1862 1.1 jtc } 1863 1.1 jtc 1864 1.1 jtc /* 1865 1.1 jtc - doemit - emit a strip operator 1866 1.9 perry == static void doemit(struct parse *p, sop op, size_t opnd); 1867 1.1 jtc * 1868 1.1 jtc * It might seem better to implement this as a macro with a function as 1869 1.1 jtc * hard-case backup, but it's just too big and messy unless there are 1870 1.1 jtc * some changes to the data structures. Maybe later. 1871 1.1 jtc */ 1872 1.1 jtc static void 1873 1.39 christos doemit(struct parse *p, sop op, size_t opnd) 1874 1.1 jtc { 1875 1.1 jtc /* avoid making error situations worse */ 1876 1.1 jtc if (p->error != 0) 1877 1.1 jtc return; 1878 1.1 jtc 1879 1.39 christos _DIAGASSERT(p != NULL); 1880 1.39 christos 1881 1.1 jtc /* deal with oversize operands ("can't happen", more or less) */ 1882 1.1 jtc assert(opnd < 1<<OPSHIFT); 1883 1.1 jtc 1884 1.1 jtc /* deal with undersized strip */ 1885 1.1 jtc if (p->slen >= p->ssize) 1886 1.30 christos if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1887 1.30 christos return; 1888 1.1 jtc 1889 1.1 jtc /* finally, it's all reduced to the easy case */ 1890 1.40 christos p->strip[p->slen++] = (sopno)SOP(op, opnd); 1891 1.1 jtc } 1892 1.1 jtc 1893 1.1 jtc /* 1894 1.1 jtc - doinsert - insert a sop into the strip 1895 1.9 perry == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1896 1.1 jtc */ 1897 1.1 jtc static void 1898 1.39 christos doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1899 1.1 jtc { 1900 1.9 perry sopno sn; 1901 1.9 perry sop s; 1902 1.9 perry int i; 1903 1.1 jtc 1904 1.14 lukem _DIAGASSERT(p != NULL); 1905 1.14 lukem 1906 1.1 jtc /* avoid making error situations worse */ 1907 1.1 jtc if (p->error != 0) 1908 1.1 jtc return; 1909 1.1 jtc 1910 1.1 jtc sn = HERE(); 1911 1.1 jtc EMIT(op, opnd); /* do checks, ensure space */ 1912 1.1 jtc assert(HERE() == sn+1); 1913 1.1 jtc s = p->strip[sn]; 1914 1.1 jtc 1915 1.1 jtc /* adjust paren pointers */ 1916 1.1 jtc assert(pos > 0); 1917 1.1 jtc for (i = 1; i < NPAREN; i++) { 1918 1.1 jtc if (p->pbegin[i] >= pos) { 1919 1.1 jtc p->pbegin[i]++; 1920 1.1 jtc } 1921 1.1 jtc if (p->pend[i] >= pos) { 1922 1.1 jtc p->pend[i]++; 1923 1.1 jtc } 1924 1.1 jtc } 1925 1.1 jtc 1926 1.40 christos memmove(&p->strip[pos+1], &p->strip[pos], 1927 1.40 christos (HERE()-pos-1)*sizeof(*p->strip)); 1928 1.1 jtc p->strip[pos] = s; 1929 1.1 jtc } 1930 1.1 jtc 1931 1.1 jtc /* 1932 1.1 jtc - dofwd - complete a forward reference 1933 1.9 perry == static void dofwd(struct parse *p, sopno pos, sop value); 1934 1.1 jtc */ 1935 1.1 jtc static void 1936 1.39 christos dofwd(struct parse *p, sopno pos, sop value) 1937 1.1 jtc { 1938 1.14 lukem 1939 1.14 lukem _DIAGASSERT(p != NULL); 1940 1.14 lukem 1941 1.1 jtc /* avoid making error situations worse */ 1942 1.1 jtc if (p->error != 0) 1943 1.1 jtc return; 1944 1.1 jtc 1945 1.1 jtc assert(value < 1<<OPSHIFT); 1946 1.39 christos p->strip[pos] = OP(p->strip[pos]) | value; 1947 1.1 jtc } 1948 1.1 jtc 1949 1.1 jtc /* 1950 1.1 jtc - enlarge - enlarge the strip 1951 1.39 christos == static int enlarge(struct parse *p, sopno size); 1952 1.1 jtc */ 1953 1.30 christos static int 1954 1.35 joerg enlarge(struct parse *p, sopno size) 1955 1.1 jtc { 1956 1.39 christos sop *sp; 1957 1.39 christos 1958 1.14 lukem _DIAGASSERT(p != NULL); 1959 1.14 lukem 1960 1.1 jtc if (p->ssize >= size) 1961 1.30 christos return 1; 1962 1.1 jtc 1963 1.40 christos sp = reallocarray(p->strip, size, sizeof(*p->strip)); 1964 1.39 christos if (sp == NULL) { 1965 1.1 jtc SETERROR(REG_ESPACE); 1966 1.30 christos return 0; 1967 1.1 jtc } 1968 1.39 christos p->strip = sp; 1969 1.35 joerg p->ssize = size; 1970 1.30 christos return 1; 1971 1.1 jtc } 1972 1.1 jtc 1973 1.1 jtc /* 1974 1.1 jtc - stripsnug - compact the strip 1975 1.9 perry == static void stripsnug(struct parse *p, struct re_guts *g); 1976 1.1 jtc */ 1977 1.1 jtc static void 1978 1.39 christos stripsnug(struct parse *p, struct re_guts *g) 1979 1.1 jtc { 1980 1.14 lukem 1981 1.14 lukem _DIAGASSERT(p != NULL); 1982 1.14 lukem _DIAGASSERT(g != NULL); 1983 1.14 lukem 1984 1.1 jtc g->nstates = p->slen; 1985 1.40 christos g->strip = reallocarray(p->strip, p->slen, sizeof(*p->strip)); 1986 1.39 christos if (g->strip == NULL) { 1987 1.39 christos SETERROR(REG_ESPACE); 1988 1.39 christos g->strip = p->strip; 1989 1.39 christos } 1990 1.1 jtc } 1991 1.1 jtc 1992 1.1 jtc /* 1993 1.1 jtc - findmust - fill in must and mlen with longest mandatory literal string 1994 1.9 perry == static void findmust(struct parse *p, struct re_guts *g); 1995 1.1 jtc * 1996 1.1 jtc * This algorithm could do fancy things like analyzing the operands of | 1997 1.1 jtc * for common subsequences. Someday. This code is simple and finds most 1998 1.1 jtc * of the interesting cases. 1999 1.1 jtc * 2000 1.1 jtc * Note that must and mlen got initialized during setup. 2001 1.1 jtc */ 2002 1.1 jtc static void 2003 1.39 christos findmust(struct parse *p, struct re_guts *g) 2004 1.1 jtc { 2005 1.9 perry sop *scan; 2006 1.7 christos sop *start = NULL; 2007 1.9 perry sop *newstart = NULL; 2008 1.9 perry sopno newlen; 2009 1.9 perry sop s; 2010 1.9 perry char *cp; 2011 1.39 christos int offset; 2012 1.39 christos mbstate_t mbs; 2013 1.1 jtc 2014 1.14 lukem _DIAGASSERT(p != NULL); 2015 1.14 lukem _DIAGASSERT(g != NULL); 2016 1.14 lukem 2017 1.1 jtc /* avoid making error situations worse */ 2018 1.1 jtc if (p->error != 0) 2019 1.1 jtc return; 2020 1.1 jtc 2021 1.39 christos #ifdef notyet 2022 1.39 christos /* 2023 1.39 christos * It's not generally safe to do a ``char'' substring search on 2024 1.39 christos * multibyte character strings, but it's safe for at least 2025 1.39 christos * UTF-8 (see RFC 3629). 2026 1.39 christos */ 2027 1.39 christos if (MB_CUR_MAX > 1 && 2028 1.39 christos strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 2029 1.39 christos return; 2030 1.39 christos #endif 2031 1.39 christos 2032 1.1 jtc /* find the longest OCHAR sequence in strip */ 2033 1.1 jtc newlen = 0; 2034 1.39 christos offset = 0; 2035 1.39 christos g->moffset = 0; 2036 1.1 jtc scan = g->strip + 1; 2037 1.1 jtc do { 2038 1.1 jtc s = *scan++; 2039 1.1 jtc switch (OP(s)) { 2040 1.1 jtc case OCHAR: /* sequence member */ 2041 1.39 christos if (newlen == 0) { /* new sequence */ 2042 1.39 christos memset(&mbs, 0, sizeof(mbs)); 2043 1.1 jtc newstart = scan - 1; 2044 1.39 christos } 2045 1.42 christos #ifdef NLS 2046 1.42 christos char buf[MB_LEN_MAX]; 2047 1.42 christos size_t clen = wcrtomb(buf, (int)OPND(s), &mbs); 2048 1.39 christos if (clen == (size_t)-1) 2049 1.39 christos goto toohard; 2050 1.40 christos newlen += (sopno)clen; 2051 1.42 christos #else 2052 1.42 christos newlen++; 2053 1.42 christos #endif 2054 1.1 jtc break; 2055 1.1 jtc case OPLUS_: /* things that don't break one */ 2056 1.1 jtc case OLPAREN: 2057 1.1 jtc case ORPAREN: 2058 1.1 jtc break; 2059 1.1 jtc case OQUEST_: /* things that must be skipped */ 2060 1.1 jtc case OCH_: 2061 1.39 christos offset = altoffset(scan, offset); 2062 1.1 jtc scan--; 2063 1.1 jtc do { 2064 1.1 jtc scan += OPND(s); 2065 1.1 jtc s = *scan; 2066 1.1 jtc /* assert() interferes w debug printouts */ 2067 1.40 christos if (OP(s) != O_QUEST && 2068 1.40 christos OP(s) != O_CH && OP(s) != OOR2) { 2069 1.1 jtc g->iflags |= BAD; 2070 1.1 jtc return; 2071 1.1 jtc } 2072 1.40 christos } while (OP(s) != O_QUEST && OP(s) != O_CH); 2073 1.11 christos /* FALLTHROUGH */ 2074 1.39 christos case OBOW: /* things that break a sequence */ 2075 1.39 christos case OEOW: 2076 1.39 christos case OBOL: 2077 1.39 christos case OEOL: 2078 1.39 christos case OBOS: 2079 1.39 christos case OEOS: 2080 1.39 christos case OWBND: 2081 1.39 christos case ONWBND: 2082 1.39 christos case O_QUEST: 2083 1.39 christos case O_CH: 2084 1.39 christos case OEND: 2085 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */ 2086 1.1 jtc start = newstart; 2087 1.1 jtc g->mlen = newlen; 2088 1.39 christos if (offset > -1) { 2089 1.39 christos g->moffset += offset; 2090 1.39 christos offset = newlen; 2091 1.39 christos } else 2092 1.39 christos g->moffset = offset; 2093 1.39 christos } else { 2094 1.39 christos if (offset > -1) 2095 1.39 christos offset += newlen; 2096 1.1 jtc } 2097 1.1 jtc newlen = 0; 2098 1.1 jtc break; 2099 1.39 christos case OANY: 2100 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */ 2101 1.39 christos start = newstart; 2102 1.39 christos g->mlen = newlen; 2103 1.39 christos if (offset > -1) { 2104 1.39 christos g->moffset += offset; 2105 1.39 christos offset = newlen; 2106 1.39 christos } else 2107 1.39 christos g->moffset = offset; 2108 1.39 christos } else { 2109 1.39 christos if (offset > -1) 2110 1.39 christos offset += newlen; 2111 1.39 christos } 2112 1.39 christos if (offset > -1) 2113 1.39 christos offset++; 2114 1.39 christos newlen = 0; 2115 1.39 christos break; 2116 1.39 christos case OANYOF: /* may or may not invalidate offset */ 2117 1.39 christos /* First, everything as OANY */ 2118 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */ 2119 1.39 christos start = newstart; 2120 1.39 christos g->mlen = newlen; 2121 1.39 christos if (offset > -1) { 2122 1.39 christos g->moffset += offset; 2123 1.39 christos offset = newlen; 2124 1.39 christos } else 2125 1.39 christos g->moffset = offset; 2126 1.39 christos } else { 2127 1.39 christos if (offset > -1) 2128 1.39 christos offset += newlen; 2129 1.39 christos } 2130 1.39 christos if (offset > -1) 2131 1.39 christos offset++; 2132 1.39 christos newlen = 0; 2133 1.39 christos break; 2134 1.42 christos #ifdef NLS 2135 1.40 christos toohard:/*FALLTHROUGH*/ 2136 1.42 christos #endif 2137 1.39 christos default: 2138 1.39 christos /* Anything here makes it impossible or too hard 2139 1.39 christos * to calculate the offset -- so we give up; 2140 1.39 christos * save the last known good offset, in case the 2141 1.39 christos * must sequence doesn't occur later. 2142 1.39 christos */ 2143 1.39 christos if (newlen > (sopno)g->mlen) { /* ends one */ 2144 1.39 christos start = newstart; 2145 1.39 christos g->mlen = newlen; 2146 1.39 christos if (offset > -1) 2147 1.39 christos g->moffset += offset; 2148 1.39 christos else 2149 1.39 christos g->moffset = offset; 2150 1.39 christos } 2151 1.39 christos offset = -1; 2152 1.39 christos newlen = 0; 2153 1.39 christos break; 2154 1.1 jtc } 2155 1.1 jtc } while (OP(s) != OEND); 2156 1.1 jtc 2157 1.39 christos if (g->mlen == 0) { /* there isn't one */ 2158 1.39 christos g->moffset = -1; 2159 1.1 jtc return; 2160 1.39 christos } 2161 1.1 jtc 2162 1.1 jtc /* turn it into a character string */ 2163 1.1 jtc g->must = malloc((size_t)g->mlen + 1); 2164 1.1 jtc if (g->must == NULL) { /* argh; just forget it */ 2165 1.1 jtc g->mlen = 0; 2166 1.39 christos g->moffset = -1; 2167 1.1 jtc return; 2168 1.1 jtc } 2169 1.1 jtc cp = g->must; 2170 1.1 jtc scan = start; 2171 1.39 christos memset(&mbs, 0, sizeof(mbs)); 2172 1.39 christos while (cp < g->must + g->mlen) { 2173 1.1 jtc while (OP(s = *scan++) != OCHAR) 2174 1.1 jtc continue; 2175 1.42 christos #ifdef NLS 2176 1.43 christos size_t clen = wcrtomb(cp, (int)OPND(s), &mbs); 2177 1.39 christos assert(clen != (size_t)-1); 2178 1.39 christos cp += clen; 2179 1.42 christos #else 2180 1.42 christos *cp++ = OPND(s); 2181 1.42 christos #endif 2182 1.1 jtc } 2183 1.2 jtc assert(cp == g->must + g->mlen); 2184 1.1 jtc *cp++ = '\0'; /* just on general principles */ 2185 1.1 jtc } 2186 1.1 jtc 2187 1.1 jtc /* 2188 1.39 christos - altoffset - choose biggest offset among multiple choices 2189 1.39 christos == static int altoffset(sop *scan, int offset); 2190 1.39 christos * 2191 1.39 christos * Compute, recursively if necessary, the largest offset among multiple 2192 1.39 christos * re paths. 2193 1.39 christos */ 2194 1.39 christos static int 2195 1.39 christos altoffset(sop *scan, int offset) 2196 1.39 christos { 2197 1.39 christos int largest; 2198 1.39 christos int try; 2199 1.39 christos sop s; 2200 1.39 christos 2201 1.39 christos _DIAGASSERT(scan != NULL); 2202 1.39 christos 2203 1.39 christos /* If we gave up already on offsets, return */ 2204 1.39 christos if (offset == -1) 2205 1.39 christos return -1; 2206 1.39 christos 2207 1.39 christos largest = 0; 2208 1.39 christos try = 0; 2209 1.39 christos s = *scan++; 2210 1.40 christos while (OP(s) != O_QUEST && OP(s) != O_CH) { 2211 1.39 christos switch (OP(s)) { 2212 1.39 christos case OOR1: 2213 1.39 christos if (try > largest) 2214 1.39 christos largest = try; 2215 1.39 christos try = 0; 2216 1.39 christos break; 2217 1.39 christos case OQUEST_: 2218 1.39 christos case OCH_: 2219 1.39 christos try = altoffset(scan, try); 2220 1.39 christos if (try == -1) 2221 1.39 christos return -1; 2222 1.39 christos scan--; 2223 1.39 christos do { 2224 1.39 christos scan += OPND(s); 2225 1.39 christos s = *scan; 2226 1.40 christos if (OP(s) != O_QUEST && 2227 1.40 christos OP(s) != O_CH && OP(s) != OOR2) 2228 1.39 christos return -1; 2229 1.40 christos } while (OP(s) != O_QUEST && OP(s) != O_CH); 2230 1.39 christos /* We must skip to the next position, or we'll 2231 1.39 christos * leave altoffset() too early. 2232 1.39 christos */ 2233 1.39 christos scan++; 2234 1.39 christos break; 2235 1.39 christos case OANYOF: 2236 1.39 christos case OCHAR: 2237 1.39 christos case OANY: 2238 1.39 christos try++; 2239 1.40 christos /*FALLTHROUGH*/ 2240 1.39 christos case OBOW: 2241 1.39 christos case OEOW: 2242 1.39 christos case OWBND: 2243 1.39 christos case ONWBND: 2244 1.39 christos case OLPAREN: 2245 1.39 christos case ORPAREN: 2246 1.39 christos case OOR2: 2247 1.39 christos break; 2248 1.39 christos default: 2249 1.39 christos try = -1; 2250 1.39 christos break; 2251 1.39 christos } 2252 1.39 christos if (try == -1) 2253 1.39 christos return -1; 2254 1.39 christos s = *scan++; 2255 1.39 christos } 2256 1.39 christos 2257 1.39 christos if (try > largest) 2258 1.39 christos largest = try; 2259 1.39 christos 2260 1.39 christos return largest+offset; 2261 1.39 christos } 2262 1.39 christos 2263 1.39 christos /* 2264 1.39 christos - computejumps - compute char jumps for BM scan 2265 1.39 christos == static void computejumps(struct parse *p, struct re_guts *g); 2266 1.39 christos * 2267 1.39 christos * This algorithm assumes g->must exists and is has size greater than 2268 1.39 christos * zero. It's based on the algorithm found on Computer Algorithms by 2269 1.39 christos * Sara Baase. 2270 1.39 christos * 2271 1.39 christos * A char jump is the number of characters one needs to jump based on 2272 1.39 christos * the value of the character from the text that was mismatched. 2273 1.39 christos */ 2274 1.39 christos static void 2275 1.39 christos computejumps(struct parse *p, struct re_guts *g) 2276 1.39 christos { 2277 1.39 christos int ch; 2278 1.39 christos size_t mindex; 2279 1.39 christos 2280 1.39 christos _DIAGASSERT(p != NULL); 2281 1.39 christos _DIAGASSERT(g != NULL); 2282 1.39 christos 2283 1.39 christos /* Avoid making errors worse */ 2284 1.39 christos if (p->error != 0) 2285 1.39 christos return; 2286 1.39 christos 2287 1.39 christos g->charjump = calloc((NC_MAX + 1), sizeof(*g->charjump)); 2288 1.39 christos if (g->charjump == NULL) /* Not a fatal error */ 2289 1.39 christos return; 2290 1.39 christos /* Adjust for signed chars, if necessary */ 2291 1.39 christos g->charjump = &g->charjump[-(CHAR_MIN)]; 2292 1.39 christos 2293 1.39 christos /* If the character does not exist in the pattern, the jump 2294 1.39 christos * is equal to the number of characters in the pattern. 2295 1.39 christos */ 2296 1.39 christos for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 2297 1.39 christos g->charjump[ch] = g->mlen; 2298 1.39 christos 2299 1.39 christos /* If the character does exist, compute the jump that would 2300 1.39 christos * take us to the last character in the pattern equal to it 2301 1.39 christos * (notice that we match right to left, so that last character 2302 1.39 christos * is the first one that would be matched). 2303 1.39 christos */ 2304 1.39 christos for (mindex = 0; mindex < g->mlen; mindex++) 2305 1.39 christos g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 2306 1.39 christos } 2307 1.39 christos 2308 1.39 christos /* 2309 1.39 christos - computematchjumps - compute match jumps for BM scan 2310 1.39 christos == static void computematchjumps(struct parse *p, struct re_guts *g); 2311 1.39 christos * 2312 1.39 christos * This algorithm assumes g->must exists and is has size greater than 2313 1.39 christos * zero. It's based on the algorithm found on Computer Algorithms by 2314 1.39 christos * Sara Baase. 2315 1.39 christos * 2316 1.39 christos * A match jump is the number of characters one needs to advance based 2317 1.39 christos * on the already-matched suffix. 2318 1.39 christos * Notice that all values here are minus (g->mlen-1), because of the way 2319 1.39 christos * the search algorithm works. 2320 1.39 christos */ 2321 1.39 christos static void 2322 1.39 christos computematchjumps(struct parse *p, struct re_guts *g) 2323 1.39 christos { 2324 1.39 christos size_t mindex; /* General "must" iterator */ 2325 1.39 christos size_t suffix; /* Keeps track of matching suffix */ 2326 1.39 christos size_t ssuffix; /* Keeps track of suffixes' suffix */ 2327 1.39 christos size_t* pmatches; /* pmatches[k] points to the next i 2328 1.39 christos * such that i+1...mlen is a substring 2329 1.39 christos * of k+1...k+mlen-i-1 2330 1.39 christos */ 2331 1.39 christos 2332 1.39 christos _DIAGASSERT(p != NULL); 2333 1.39 christos _DIAGASSERT(g != NULL); 2334 1.39 christos 2335 1.39 christos /* Avoid making errors worse */ 2336 1.39 christos if (p->error != 0) 2337 1.39 christos return; 2338 1.39 christos 2339 1.39 christos pmatches = calloc(g->mlen, sizeof(*pmatches)); 2340 1.39 christos if (pmatches == NULL) { 2341 1.39 christos g->matchjump = NULL; 2342 1.39 christos return; 2343 1.39 christos } 2344 1.39 christos 2345 1.39 christos g->matchjump = calloc(g->mlen, sizeof(*g->matchjump)); 2346 1.39 christos if (g->matchjump == NULL) { /* Not a fatal error */ 2347 1.39 christos free(pmatches); 2348 1.39 christos return; 2349 1.39 christos } 2350 1.39 christos 2351 1.39 christos /* Set maximum possible jump for each character in the pattern */ 2352 1.39 christos for (mindex = 0; mindex < g->mlen; mindex++) 2353 1.39 christos g->matchjump[mindex] = 2 * g->mlen - mindex - 1; 2354 1.39 christos 2355 1.39 christos /* Compute pmatches[] */ 2356 1.39 christos for (suffix = mindex = g->mlen; mindex-- > 0; suffix--) { 2357 1.39 christos pmatches[mindex] = suffix; 2358 1.39 christos 2359 1.39 christos /* If a mismatch is found, interrupting the substring, 2360 1.39 christos * compute the matchjump for that position. If no 2361 1.39 christos * mismatch is found, then a text substring mismatched 2362 1.39 christos * against the suffix will also mismatch against the 2363 1.39 christos * substring. 2364 1.39 christos */ 2365 1.39 christos while (suffix < g->mlen 2366 1.39 christos && g->must[mindex] != g->must[suffix]) { 2367 1.39 christos g->matchjump[suffix] = MIN(g->matchjump[suffix], 2368 1.39 christos g->mlen - mindex - 1); 2369 1.39 christos suffix = pmatches[suffix]; 2370 1.39 christos } 2371 1.39 christos } 2372 1.39 christos 2373 1.39 christos /* Compute the matchjump up to the last substring found to jump 2374 1.39 christos * to the beginning of the largest must pattern prefix matching 2375 1.39 christos * it's own suffix. 2376 1.39 christos */ 2377 1.39 christos for (mindex = 0; mindex <= suffix; mindex++) 2378 1.39 christos g->matchjump[mindex] = MIN(g->matchjump[mindex], 2379 1.39 christos g->mlen + suffix - mindex); 2380 1.39 christos 2381 1.39 christos ssuffix = pmatches[suffix]; 2382 1.39 christos while (suffix < g->mlen) { 2383 1.39 christos while (suffix <= ssuffix && suffix < g->mlen) { 2384 1.39 christos g->matchjump[suffix] = MIN(g->matchjump[suffix], 2385 1.39 christos g->mlen + ssuffix - suffix); 2386 1.39 christos suffix++; 2387 1.39 christos } 2388 1.39 christos if (suffix < g->mlen) 2389 1.39 christos ssuffix = pmatches[ssuffix]; 2390 1.39 christos } 2391 1.39 christos 2392 1.39 christos free(pmatches); 2393 1.39 christos } 2394 1.39 christos 2395 1.39 christos /* 2396 1.1 jtc - pluscount - count + nesting 2397 1.9 perry == static sopno pluscount(struct parse *p, struct re_guts *g); 2398 1.1 jtc */ 2399 1.1 jtc static sopno /* nesting depth */ 2400 1.39 christos pluscount(struct parse *p, struct re_guts *g) 2401 1.1 jtc { 2402 1.9 perry sop *scan; 2403 1.9 perry sop s; 2404 1.9 perry sopno plusnest = 0; 2405 1.9 perry sopno maxnest = 0; 2406 1.14 lukem 2407 1.14 lukem _DIAGASSERT(p != NULL); 2408 1.14 lukem _DIAGASSERT(g != NULL); 2409 1.1 jtc 2410 1.1 jtc if (p->error != 0) 2411 1.1 jtc return(0); /* there may not be an OEND */ 2412 1.1 jtc 2413 1.1 jtc scan = g->strip + 1; 2414 1.1 jtc do { 2415 1.1 jtc s = *scan++; 2416 1.1 jtc switch (OP(s)) { 2417 1.1 jtc case OPLUS_: 2418 1.1 jtc plusnest++; 2419 1.1 jtc break; 2420 1.1 jtc case O_PLUS: 2421 1.1 jtc if (plusnest > maxnest) 2422 1.1 jtc maxnest = plusnest; 2423 1.1 jtc plusnest--; 2424 1.1 jtc break; 2425 1.1 jtc } 2426 1.1 jtc } while (OP(s) != OEND); 2427 1.1 jtc if (plusnest != 0) 2428 1.1 jtc g->iflags |= BAD; 2429 1.1 jtc return(maxnest); 2430 1.1 jtc } 2431