regcomp.c revision 1.6 1 /* $NetBSD: regcomp.c,v 1.6 1995/02/27 13:29:01 cgd Exp $ */
2
3 /*-
4 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5 * Copyright (c) 1992, 1993, 1994
6 * The Regents of the University of California. All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * Henry Spencer.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the University of
22 * California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 *
39 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
40 */
41
42 #if defined(LIBC_SCCS) && !defined(lint)
43 #if 0
44 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
45 #else
46 static char rcsid[] = "$NetBSD: regcomp.c,v 1.6 1995/02/27 13:29:01 cgd Exp $";
47 #endif
48 #endif /* LIBC_SCCS and not lint */
49
50 #include <sys/types.h>
51 #include <stdio.h>
52 #include <string.h>
53 #include <ctype.h>
54 #include <limits.h>
55 #include <stdlib.h>
56 #include <regex.h>
57
58 #include "utils.h"
59 #include "regex2.h"
60
61 #include "cclass.h"
62 #include "cname.h"
63
64 /*
65 * parse structure, passed up and down to avoid global variables and
66 * other clumsinesses
67 */
68 struct parse {
69 char *next; /* next character in RE */
70 char *end; /* end of string (-> NUL normally) */
71 int error; /* has an error been seen? */
72 sop *strip; /* malloced strip */
73 sopno ssize; /* malloced strip size (allocated) */
74 sopno slen; /* malloced strip length (used) */
75 int ncsalloc; /* number of csets allocated */
76 struct re_guts *g;
77 # define NPAREN 10 /* we need to remember () 1-9 for back refs */
78 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
79 sopno pend[NPAREN]; /* -> ) ([0] unused) */
80 };
81
82 /* ========= begin header generated by ./mkh ========= */
83 #ifdef __cplusplus
84 extern "C" {
85 #endif
86
87 /* === regcomp.c === */
88 static void p_ere __P((struct parse *p, int stop));
89 static void p_ere_exp __P((struct parse *p));
90 static void p_str __P((struct parse *p));
91 static void p_bre __P((struct parse *p, int end1, int end2));
92 static int p_simp_re __P((struct parse *p, int starordinary));
93 static int p_count __P((struct parse *p));
94 static void p_bracket __P((struct parse *p));
95 static void p_b_term __P((struct parse *p, cset *cs));
96 static void p_b_cclass __P((struct parse *p, cset *cs));
97 static void p_b_eclass __P((struct parse *p, cset *cs));
98 static char p_b_symbol __P((struct parse *p));
99 static char p_b_coll_elem __P((struct parse *p, int endc));
100 static char othercase __P((int ch));
101 static void bothcases __P((struct parse *p, int ch));
102 static void ordinary __P((struct parse *p, int ch));
103 static void nonnewline __P((struct parse *p));
104 static void repeat __P((struct parse *p, sopno start, int from, int to));
105 static int seterr __P((struct parse *p, int e));
106 static cset *allocset __P((struct parse *p));
107 static void freeset __P((struct parse *p, cset *cs));
108 static int freezeset __P((struct parse *p, cset *cs));
109 static int firstch __P((struct parse *p, cset *cs));
110 static int nch __P((struct parse *p, cset *cs));
111 static void mcadd __P((struct parse *p, cset *cs, char *cp));
112 static void mcsub __P((cset *cs, char *cp));
113 static int mcin __P((cset *cs, char *cp));
114 static char *mcfind __P((cset *cs, char *cp));
115 static void mcinvert __P((struct parse *p, cset *cs));
116 static void mccase __P((struct parse *p, cset *cs));
117 static int isinsets __P((struct re_guts *g, int c));
118 static int samesets __P((struct re_guts *g, int c1, int c2));
119 static void categorize __P((struct parse *p, struct re_guts *g));
120 static sopno dupl __P((struct parse *p, sopno start, sopno finish));
121 static void doemit __P((struct parse *p, sop op, size_t opnd));
122 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos));
123 static void dofwd __P((struct parse *p, sopno pos, sop value));
124 static void enlarge __P((struct parse *p, sopno size));
125 static void stripsnug __P((struct parse *p, struct re_guts *g));
126 static void findmust __P((struct parse *p, struct re_guts *g));
127 static sopno pluscount __P((struct parse *p, struct re_guts *g));
128
129 #ifdef __cplusplus
130 }
131 #endif
132 /* ========= end header generated by ./mkh ========= */
133
134 static char nuls[10]; /* place to point scanner in event of error */
135
136 /*
137 * macros for use with parse structure
138 * BEWARE: these know that the parse structure is named `p' !!!
139 */
140 #define PEEK() (*p->next)
141 #define PEEK2() (*(p->next+1))
142 #define MORE() (p->next < p->end)
143 #define MORE2() (p->next+1 < p->end)
144 #define SEE(c) (MORE() && PEEK() == (c))
145 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
146 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
147 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
148 #define NEXT() (p->next++)
149 #define NEXT2() (p->next += 2)
150 #define NEXTn(n) (p->next += (n))
151 #define GETNEXT() (*p->next++)
152 #define SETERROR(e) seterr(p, (e))
153 #define REQUIRE(co, e) ((co) || SETERROR(e))
154 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
155 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
156 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
157 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
158 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
159 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
160 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
161 #define HERE() (p->slen)
162 #define THERE() (p->slen - 1)
163 #define THERETHERE() (p->slen - 2)
164 #define DROP(n) (p->slen -= (n))
165
166 #ifndef NDEBUG
167 static int never = 0; /* for use in asserts; shuts lint up */
168 #else
169 #define never 0 /* some <assert.h>s have bugs too */
170 #endif
171
172 /*
173 - regcomp - interface for parser and compilation
174 = extern int regcomp(regex_t *, const char *, int);
175 = #define REG_BASIC 0000
176 = #define REG_EXTENDED 0001
177 = #define REG_ICASE 0002
178 = #define REG_NOSUB 0004
179 = #define REG_NEWLINE 0010
180 = #define REG_NOSPEC 0020
181 = #define REG_PEND 0040
182 = #define REG_DUMP 0200
183 */
184 int /* 0 success, otherwise REG_something */
185 regcomp(preg, pattern, cflags)
186 regex_t *preg;
187 const char *pattern;
188 int cflags;
189 {
190 struct parse pa;
191 register struct re_guts *g;
192 register struct parse *p = &pa;
193 register int i;
194 register size_t len;
195 #ifdef REDEBUG
196 # define GOODFLAGS(f) (f)
197 #else
198 # define GOODFLAGS(f) ((f)&~REG_DUMP)
199 #endif
200
201 cflags = GOODFLAGS(cflags);
202 if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
203 return(REG_INVARG);
204
205 if (cflags®_PEND) {
206 if (preg->re_endp < pattern)
207 return(REG_INVARG);
208 len = preg->re_endp - pattern;
209 } else
210 len = strlen((char *)pattern);
211
212 /* do the mallocs early so failure handling is easy */
213 g = (struct re_guts *)malloc(sizeof(struct re_guts) +
214 (NC-1)*sizeof(cat_t));
215 if (g == NULL)
216 return(REG_ESPACE);
217 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
218 p->strip = (sop *)malloc(p->ssize * sizeof(sop));
219 p->slen = 0;
220 if (p->strip == NULL) {
221 free((char *)g);
222 return(REG_ESPACE);
223 }
224
225 /* set things up */
226 p->g = g;
227 p->next = (char *)pattern; /* convenience; we do not modify it */
228 p->end = p->next + len;
229 p->error = 0;
230 p->ncsalloc = 0;
231 for (i = 0; i < NPAREN; i++) {
232 p->pbegin[i] = 0;
233 p->pend[i] = 0;
234 }
235 g->csetsize = NC;
236 g->sets = NULL;
237 g->setbits = NULL;
238 g->ncsets = 0;
239 g->cflags = cflags;
240 g->iflags = 0;
241 g->nbol = 0;
242 g->neol = 0;
243 g->must = NULL;
244 g->mlen = 0;
245 g->nsub = 0;
246 g->ncategories = 1; /* category 0 is "everything else" */
247 g->categories = &g->catspace[-(CHAR_MIN)];
248 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
249 g->backrefs = 0;
250
251 /* do it */
252 EMIT(OEND, 0);
253 g->firststate = THERE();
254 if (cflags®_EXTENDED)
255 p_ere(p, OUT);
256 else if (cflags®_NOSPEC)
257 p_str(p);
258 else
259 p_bre(p, OUT, OUT);
260 EMIT(OEND, 0);
261 g->laststate = THERE();
262
263 /* tidy up loose ends and fill things in */
264 categorize(p, g);
265 stripsnug(p, g);
266 findmust(p, g);
267 g->nplus = pluscount(p, g);
268 g->magic = MAGIC2;
269 preg->re_nsub = g->nsub;
270 preg->re_g = g;
271 preg->re_magic = MAGIC1;
272 #ifndef REDEBUG
273 /* not debugging, so can't rely on the assert() in regexec() */
274 if (g->iflags&BAD)
275 SETERROR(REG_ASSERT);
276 #endif
277
278 /* win or lose, we're done */
279 if (p->error != 0) /* lose */
280 regfree(preg);
281 return(p->error);
282 }
283
284 /*
285 - p_ere - ERE parser top level, concatenation and alternation
286 == static void p_ere(register struct parse *p, int stop);
287 */
288 static void
289 p_ere(p, stop)
290 register struct parse *p;
291 int stop; /* character this ERE should end at */
292 {
293 register char c;
294 register sopno prevback;
295 register sopno prevfwd;
296 register sopno conc;
297 register int first = 1; /* is this the first alternative? */
298
299 for (;;) {
300 /* do a bunch of concatenated expressions */
301 conc = HERE();
302 while (MORE() && (c = PEEK()) != '|' && c != stop)
303 p_ere_exp(p);
304 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
305
306 if (!EAT('|'))
307 break; /* NOTE BREAK OUT */
308
309 if (first) {
310 INSERT(OCH_, conc); /* offset is wrong */
311 prevfwd = conc;
312 prevback = conc;
313 first = 0;
314 }
315 ASTERN(OOR1, prevback);
316 prevback = THERE();
317 AHEAD(prevfwd); /* fix previous offset */
318 prevfwd = HERE();
319 EMIT(OOR2, 0); /* offset is very wrong */
320 }
321
322 if (!first) { /* tail-end fixups */
323 AHEAD(prevfwd);
324 ASTERN(O_CH, prevback);
325 }
326
327 assert(!MORE() || SEE(stop));
328 }
329
330 /*
331 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
332 == static void p_ere_exp(register struct parse *p);
333 */
334 static void
335 p_ere_exp(p)
336 register struct parse *p;
337 {
338 register char c;
339 register sopno pos;
340 register int count;
341 register int count2;
342 register sopno subno;
343 int wascaret = 0;
344
345 assert(MORE()); /* caller should have ensured this */
346 c = GETNEXT();
347
348 pos = HERE();
349 switch (c) {
350 case '(':
351 REQUIRE(MORE(), REG_EPAREN);
352 p->g->nsub++;
353 subno = p->g->nsub;
354 if (subno < NPAREN)
355 p->pbegin[subno] = HERE();
356 EMIT(OLPAREN, subno);
357 if (!SEE(')'))
358 p_ere(p, ')');
359 if (subno < NPAREN) {
360 p->pend[subno] = HERE();
361 assert(p->pend[subno] != 0);
362 }
363 EMIT(ORPAREN, subno);
364 MUSTEAT(')', REG_EPAREN);
365 break;
366 #ifndef POSIX_MISTAKE
367 case ')': /* happens only if no current unmatched ( */
368 /*
369 * You may ask, why the ifndef? Because I didn't notice
370 * this until slightly too late for 1003.2, and none of the
371 * other 1003.2 regular-expression reviewers noticed it at
372 * all. So an unmatched ) is legal POSIX, at least until
373 * we can get it fixed.
374 */
375 SETERROR(REG_EPAREN);
376 break;
377 #endif
378 case '^':
379 EMIT(OBOL, 0);
380 p->g->iflags |= USEBOL;
381 p->g->nbol++;
382 wascaret = 1;
383 break;
384 case '$':
385 EMIT(OEOL, 0);
386 p->g->iflags |= USEEOL;
387 p->g->neol++;
388 break;
389 case '|':
390 SETERROR(REG_EMPTY);
391 break;
392 case '*':
393 case '+':
394 case '?':
395 SETERROR(REG_BADRPT);
396 break;
397 case '.':
398 if (p->g->cflags®_NEWLINE)
399 nonnewline(p);
400 else
401 EMIT(OANY, 0);
402 break;
403 case '[':
404 p_bracket(p);
405 break;
406 case '\\':
407 REQUIRE(MORE(), REG_EESCAPE);
408 c = GETNEXT();
409 ordinary(p, c);
410 break;
411 case '{': /* okay as ordinary except if digit follows */
412 REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
413 /* FALLTHROUGH */
414 default:
415 ordinary(p, c);
416 break;
417 }
418
419 if (!MORE())
420 return;
421 c = PEEK();
422 /* we call { a repetition if followed by a digit */
423 if (!( c == '*' || c == '+' || c == '?' ||
424 (c == '{' && MORE2() && isdigit(PEEK2())) ))
425 return; /* no repetition, we're done */
426 NEXT();
427
428 REQUIRE(!wascaret, REG_BADRPT);
429 switch (c) {
430 case '*': /* implemented as +? */
431 /* this case does not require the (y|) trick, noKLUDGE */
432 INSERT(OPLUS_, pos);
433 ASTERN(O_PLUS, pos);
434 INSERT(OQUEST_, pos);
435 ASTERN(O_QUEST, pos);
436 break;
437 case '+':
438 INSERT(OPLUS_, pos);
439 ASTERN(O_PLUS, pos);
440 break;
441 case '?':
442 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
443 INSERT(OCH_, pos); /* offset slightly wrong */
444 ASTERN(OOR1, pos); /* this one's right */
445 AHEAD(pos); /* fix the OCH_ */
446 EMIT(OOR2, 0); /* offset very wrong... */
447 AHEAD(THERE()); /* ...so fix it */
448 ASTERN(O_CH, THERETHERE());
449 break;
450 case '{':
451 count = p_count(p);
452 if (EAT(',')) {
453 if (isdigit(PEEK())) {
454 count2 = p_count(p);
455 REQUIRE(count <= count2, REG_BADBR);
456 } else /* single number with comma */
457 count2 = INFINITY;
458 } else /* just a single number */
459 count2 = count;
460 repeat(p, pos, count, count2);
461 if (!EAT('}')) { /* error heuristics */
462 while (MORE() && PEEK() != '}')
463 NEXT();
464 REQUIRE(MORE(), REG_EBRACE);
465 SETERROR(REG_BADBR);
466 }
467 break;
468 }
469
470 if (!MORE())
471 return;
472 c = PEEK();
473 if (!( c == '*' || c == '+' || c == '?' ||
474 (c == '{' && MORE2() && isdigit(PEEK2())) ) )
475 return;
476 SETERROR(REG_BADRPT);
477 }
478
479 /*
480 - p_str - string (no metacharacters) "parser"
481 == static void p_str(register struct parse *p);
482 */
483 static void
484 p_str(p)
485 register struct parse *p;
486 {
487 REQUIRE(MORE(), REG_EMPTY);
488 while (MORE())
489 ordinary(p, GETNEXT());
490 }
491
492 /*
493 - p_bre - BRE parser top level, anchoring and concatenation
494 == static void p_bre(register struct parse *p, register int end1, \
495 == register int end2);
496 * Giving end1 as OUT essentially eliminates the end1/end2 check.
497 *
498 * This implementation is a bit of a kludge, in that a trailing $ is first
499 * taken as an ordinary character and then revised to be an anchor. The
500 * only undesirable side effect is that '$' gets included as a character
501 * category in such cases. This is fairly harmless; not worth fixing.
502 * The amount of lookahead needed to avoid this kludge is excessive.
503 */
504 static void
505 p_bre(p, end1, end2)
506 register struct parse *p;
507 register int end1; /* first terminating character */
508 register int end2; /* second terminating character */
509 {
510 register sopno start = HERE();
511 register int first = 1; /* first subexpression? */
512 register int wasdollar = 0;
513
514 if (EAT('^')) {
515 EMIT(OBOL, 0);
516 p->g->iflags |= USEBOL;
517 p->g->nbol++;
518 }
519 while (MORE() && !SEETWO(end1, end2)) {
520 wasdollar = p_simp_re(p, first);
521 first = 0;
522 }
523 if (wasdollar) { /* oops, that was a trailing anchor */
524 DROP(1);
525 EMIT(OEOL, 0);
526 p->g->iflags |= USEEOL;
527 p->g->neol++;
528 }
529
530 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
531 }
532
533 /*
534 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
535 == static int p_simp_re(register struct parse *p, int starordinary);
536 */
537 static int /* was the simple RE an unbackslashed $? */
538 p_simp_re(p, starordinary)
539 register struct parse *p;
540 int starordinary; /* is a leading * an ordinary character? */
541 {
542 register int c;
543 register int count;
544 register int count2;
545 register sopno pos;
546 register int i;
547 register sopno subno;
548 # define BACKSL (1<<CHAR_BIT)
549
550 pos = HERE(); /* repetion op, if any, covers from here */
551
552 assert(MORE()); /* caller should have ensured this */
553 c = GETNEXT();
554 if (c == '\\') {
555 REQUIRE(MORE(), REG_EESCAPE);
556 c = BACKSL | (unsigned char)GETNEXT();
557 }
558 switch (c) {
559 case '.':
560 if (p->g->cflags®_NEWLINE)
561 nonnewline(p);
562 else
563 EMIT(OANY, 0);
564 break;
565 case '[':
566 p_bracket(p);
567 break;
568 case BACKSL|'{':
569 SETERROR(REG_BADRPT);
570 break;
571 case BACKSL|'(':
572 p->g->nsub++;
573 subno = p->g->nsub;
574 if (subno < NPAREN)
575 p->pbegin[subno] = HERE();
576 EMIT(OLPAREN, subno);
577 /* the MORE here is an error heuristic */
578 if (MORE() && !SEETWO('\\', ')'))
579 p_bre(p, '\\', ')');
580 if (subno < NPAREN) {
581 p->pend[subno] = HERE();
582 assert(p->pend[subno] != 0);
583 }
584 EMIT(ORPAREN, subno);
585 REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
586 break;
587 case BACKSL|')': /* should not get here -- must be user */
588 case BACKSL|'}':
589 SETERROR(REG_EPAREN);
590 break;
591 case BACKSL|'1':
592 case BACKSL|'2':
593 case BACKSL|'3':
594 case BACKSL|'4':
595 case BACKSL|'5':
596 case BACKSL|'6':
597 case BACKSL|'7':
598 case BACKSL|'8':
599 case BACKSL|'9':
600 i = (c&~BACKSL) - '0';
601 assert(i < NPAREN);
602 if (p->pend[i] != 0) {
603 assert(i <= p->g->nsub);
604 EMIT(OBACK_, i);
605 assert(p->pbegin[i] != 0);
606 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
607 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
608 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
609 EMIT(O_BACK, i);
610 } else
611 SETERROR(REG_ESUBREG);
612 p->g->backrefs = 1;
613 break;
614 case '*':
615 REQUIRE(starordinary, REG_BADRPT);
616 /* FALLTHROUGH */
617 default:
618 ordinary(p, c &~ BACKSL);
619 break;
620 }
621
622 if (EAT('*')) { /* implemented as +? */
623 /* this case does not require the (y|) trick, noKLUDGE */
624 INSERT(OPLUS_, pos);
625 ASTERN(O_PLUS, pos);
626 INSERT(OQUEST_, pos);
627 ASTERN(O_QUEST, pos);
628 } else if (EATTWO('\\', '{')) {
629 count = p_count(p);
630 if (EAT(',')) {
631 if (MORE() && isdigit(PEEK())) {
632 count2 = p_count(p);
633 REQUIRE(count <= count2, REG_BADBR);
634 } else /* single number with comma */
635 count2 = INFINITY;
636 } else /* just a single number */
637 count2 = count;
638 repeat(p, pos, count, count2);
639 if (!EATTWO('\\', '}')) { /* error heuristics */
640 while (MORE() && !SEETWO('\\', '}'))
641 NEXT();
642 REQUIRE(MORE(), REG_EBRACE);
643 SETERROR(REG_BADBR);
644 }
645 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */
646 return(1);
647
648 return(0);
649 }
650
651 /*
652 - p_count - parse a repetition count
653 == static int p_count(register struct parse *p);
654 */
655 static int /* the value */
656 p_count(p)
657 register struct parse *p;
658 {
659 register int count = 0;
660 register int ndigits = 0;
661
662 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) {
663 count = count*10 + (GETNEXT() - '0');
664 ndigits++;
665 }
666
667 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
668 return(count);
669 }
670
671 /*
672 - p_bracket - parse a bracketed character list
673 == static void p_bracket(register struct parse *p);
674 *
675 * Note a significant property of this code: if the allocset() did SETERROR,
676 * no set operations are done.
677 */
678 static void
679 p_bracket(p)
680 register struct parse *p;
681 {
682 register char c;
683 register cset *cs = allocset(p);
684 register int invert = 0;
685
686 /* Dept of Truly Sickening Special-Case Kludges */
687 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
688 EMIT(OBOW, 0);
689 NEXTn(6);
690 return;
691 }
692 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
693 EMIT(OEOW, 0);
694 NEXTn(6);
695 return;
696 }
697
698 if (EAT('^'))
699 invert++; /* make note to invert set at end */
700 if (EAT(']'))
701 CHadd(cs, ']');
702 else if (EAT('-'))
703 CHadd(cs, '-');
704 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
705 p_b_term(p, cs);
706 if (EAT('-'))
707 CHadd(cs, '-');
708 MUSTEAT(']', REG_EBRACK);
709
710 if (p->error != 0) /* don't mess things up further */
711 return;
712
713 if (p->g->cflags®_ICASE) {
714 register int i;
715 register int ci;
716
717 for (i = p->g->csetsize - 1; i >= 0; i--)
718 if (CHIN(cs, i) && isalpha(i)) {
719 ci = othercase(i);
720 if (ci != i)
721 CHadd(cs, ci);
722 }
723 if (cs->multis != NULL)
724 mccase(p, cs);
725 }
726 if (invert) {
727 register int i;
728
729 for (i = p->g->csetsize - 1; i >= 0; i--)
730 if (CHIN(cs, i))
731 CHsub(cs, i);
732 else
733 CHadd(cs, i);
734 if (p->g->cflags®_NEWLINE)
735 CHsub(cs, '\n');
736 if (cs->multis != NULL)
737 mcinvert(p, cs);
738 }
739
740 assert(cs->multis == NULL); /* xxx */
741
742 if (nch(p, cs) == 1) { /* optimize singleton sets */
743 ordinary(p, firstch(p, cs));
744 freeset(p, cs);
745 } else
746 EMIT(OANYOF, freezeset(p, cs));
747 }
748
749 /*
750 - p_b_term - parse one term of a bracketed character list
751 == static void p_b_term(register struct parse *p, register cset *cs);
752 */
753 static void
754 p_b_term(p, cs)
755 register struct parse *p;
756 register cset *cs;
757 {
758 register char c;
759 register char start, finish;
760 register int i;
761
762 /* classify what we've got */
763 switch ((MORE()) ? PEEK() : '\0') {
764 case '[':
765 c = (MORE2()) ? PEEK2() : '\0';
766 break;
767 case '-':
768 SETERROR(REG_ERANGE);
769 return; /* NOTE RETURN */
770 break;
771 default:
772 c = '\0';
773 break;
774 }
775
776 switch (c) {
777 case ':': /* character class */
778 NEXT2();
779 REQUIRE(MORE(), REG_EBRACK);
780 c = PEEK();
781 REQUIRE(c != '-' && c != ']', REG_ECTYPE);
782 p_b_cclass(p, cs);
783 REQUIRE(MORE(), REG_EBRACK);
784 REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
785 break;
786 case '=': /* equivalence class */
787 NEXT2();
788 REQUIRE(MORE(), REG_EBRACK);
789 c = PEEK();
790 REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
791 p_b_eclass(p, cs);
792 REQUIRE(MORE(), REG_EBRACK);
793 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
794 break;
795 default: /* symbol, ordinary character, or range */
796 /* xxx revision needed for multichar stuff */
797 start = p_b_symbol(p);
798 if (SEE('-') && MORE2() && PEEK2() != ']') {
799 /* range */
800 NEXT();
801 if (EAT('-'))
802 finish = '-';
803 else
804 finish = p_b_symbol(p);
805 } else
806 finish = start;
807 /* xxx what about signed chars here... */
808 REQUIRE(start <= finish, REG_ERANGE);
809 for (i = start; i <= finish; i++)
810 CHadd(cs, i);
811 break;
812 }
813 }
814
815 /*
816 - p_b_cclass - parse a character-class name and deal with it
817 == static void p_b_cclass(register struct parse *p, register cset *cs);
818 */
819 static void
820 p_b_cclass(p, cs)
821 register struct parse *p;
822 register cset *cs;
823 {
824 register char *sp = p->next;
825 register struct cclass *cp;
826 register size_t len;
827 register char *u;
828 register char c;
829
830 while (MORE() && isalpha(PEEK()))
831 NEXT();
832 len = p->next - sp;
833 for (cp = cclasses; cp->name != NULL; cp++)
834 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
835 break;
836 if (cp->name == NULL) {
837 /* oops, didn't find it */
838 SETERROR(REG_ECTYPE);
839 return;
840 }
841
842 u = cp->chars;
843 while ((c = *u++) != '\0')
844 CHadd(cs, c);
845 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
846 MCadd(p, cs, u);
847 }
848
849 /*
850 - p_b_eclass - parse an equivalence-class name and deal with it
851 == static void p_b_eclass(register struct parse *p, register cset *cs);
852 *
853 * This implementation is incomplete. xxx
854 */
855 static void
856 p_b_eclass(p, cs)
857 register struct parse *p;
858 register cset *cs;
859 {
860 register char c;
861
862 c = p_b_coll_elem(p, '=');
863 CHadd(cs, c);
864 }
865
866 /*
867 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
868 == static char p_b_symbol(register struct parse *p);
869 */
870 static char /* value of symbol */
871 p_b_symbol(p)
872 register struct parse *p;
873 {
874 register char value;
875
876 REQUIRE(MORE(), REG_EBRACK);
877 if (!EATTWO('[', '.'))
878 return(GETNEXT());
879
880 /* collating symbol */
881 value = p_b_coll_elem(p, '.');
882 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
883 return(value);
884 }
885
886 /*
887 - p_b_coll_elem - parse a collating-element name and look it up
888 == static char p_b_coll_elem(register struct parse *p, int endc);
889 */
890 static char /* value of collating element */
891 p_b_coll_elem(p, endc)
892 register struct parse *p;
893 int endc; /* name ended by endc,']' */
894 {
895 register char *sp = p->next;
896 register struct cname *cp;
897 register int len;
898 register char c;
899
900 while (MORE() && !SEETWO(endc, ']'))
901 NEXT();
902 if (!MORE()) {
903 SETERROR(REG_EBRACK);
904 return(0);
905 }
906 len = p->next - sp;
907 for (cp = cnames; cp->name != NULL; cp++)
908 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
909 return(cp->code); /* known name */
910 if (len == 1)
911 return(*sp); /* single character */
912 SETERROR(REG_ECOLLATE); /* neither */
913 return(0);
914 }
915
916 /*
917 - othercase - return the case counterpart of an alphabetic
918 == static char othercase(int ch);
919 */
920 static char /* if no counterpart, return ch */
921 othercase(ch)
922 int ch;
923 {
924 assert(isalpha(ch));
925 if (isupper(ch))
926 return(tolower(ch));
927 else if (islower(ch))
928 return(toupper(ch));
929 else /* peculiar, but could happen */
930 return(ch);
931 }
932
933 /*
934 - bothcases - emit a dualcase version of a two-case character
935 == static void bothcases(register struct parse *p, int ch);
936 *
937 * Boy, is this implementation ever a kludge...
938 */
939 static void
940 bothcases(p, ch)
941 register struct parse *p;
942 int ch;
943 {
944 register char *oldnext = p->next;
945 register char *oldend = p->end;
946 char bracket[3];
947
948 assert(othercase(ch) != ch); /* p_bracket() would recurse */
949 p->next = bracket;
950 p->end = bracket+2;
951 bracket[0] = ch;
952 bracket[1] = ']';
953 bracket[2] = '\0';
954 p_bracket(p);
955 assert(p->next == bracket+2);
956 p->next = oldnext;
957 p->end = oldend;
958 }
959
960 /*
961 - ordinary - emit an ordinary character
962 == static void ordinary(register struct parse *p, register int ch);
963 */
964 static void
965 ordinary(p, ch)
966 register struct parse *p;
967 register int ch;
968 {
969 register cat_t *cap = p->g->categories;
970
971 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch)
972 bothcases(p, ch);
973 else {
974 EMIT(OCHAR, (unsigned char)ch);
975 if (cap[ch] == 0)
976 cap[ch] = p->g->ncategories++;
977 }
978 }
979
980 /*
981 - nonnewline - emit REG_NEWLINE version of OANY
982 == static void nonnewline(register struct parse *p);
983 *
984 * Boy, is this implementation ever a kludge...
985 */
986 static void
987 nonnewline(p)
988 register struct parse *p;
989 {
990 register char *oldnext = p->next;
991 register char *oldend = p->end;
992 char bracket[4];
993
994 p->next = bracket;
995 p->end = bracket+3;
996 bracket[0] = '^';
997 bracket[1] = '\n';
998 bracket[2] = ']';
999 bracket[3] = '\0';
1000 p_bracket(p);
1001 assert(p->next == bracket+3);
1002 p->next = oldnext;
1003 p->end = oldend;
1004 }
1005
1006 /*
1007 - repeat - generate code for a bounded repetition, recursively if needed
1008 == static void repeat(register struct parse *p, sopno start, int from, int to);
1009 */
1010 static void
1011 repeat(p, start, from, to)
1012 register struct parse *p;
1013 sopno start; /* operand from here to end of strip */
1014 int from; /* repeated from this number */
1015 int to; /* to this number of times (maybe INFINITY) */
1016 {
1017 register sopno finish = HERE();
1018 # define N 2
1019 # define INF 3
1020 # define REP(f, t) ((f)*8 + (t))
1021 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1022 register sopno copy;
1023
1024 if (p->error != 0) /* head off possible runaway recursion */
1025 return;
1026
1027 assert(from <= to);
1028
1029 switch (REP(MAP(from), MAP(to))) {
1030 case REP(0, 0): /* must be user doing this */
1031 DROP(finish-start); /* drop the operand */
1032 break;
1033 case REP(0, 1): /* as x{1,1}? */
1034 case REP(0, N): /* as x{1,n}? */
1035 case REP(0, INF): /* as x{1,}? */
1036 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1037 INSERT(OCH_, start); /* offset is wrong... */
1038 repeat(p, start+1, 1, to);
1039 ASTERN(OOR1, start);
1040 AHEAD(start); /* ... fix it */
1041 EMIT(OOR2, 0);
1042 AHEAD(THERE());
1043 ASTERN(O_CH, THERETHERE());
1044 break;
1045 case REP(1, 1): /* trivial case */
1046 /* done */
1047 break;
1048 case REP(1, N): /* as x?x{1,n-1} */
1049 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1050 INSERT(OCH_, start);
1051 ASTERN(OOR1, start);
1052 AHEAD(start);
1053 EMIT(OOR2, 0); /* offset very wrong... */
1054 AHEAD(THERE()); /* ...so fix it */
1055 ASTERN(O_CH, THERETHERE());
1056 copy = dupl(p, start+1, finish+1);
1057 assert(copy == finish+4);
1058 repeat(p, copy, 1, to-1);
1059 break;
1060 case REP(1, INF): /* as x+ */
1061 INSERT(OPLUS_, start);
1062 ASTERN(O_PLUS, start);
1063 break;
1064 case REP(N, N): /* as xx{m-1,n-1} */
1065 copy = dupl(p, start, finish);
1066 repeat(p, copy, from-1, to-1);
1067 break;
1068 case REP(N, INF): /* as xx{n-1,INF} */
1069 copy = dupl(p, start, finish);
1070 repeat(p, copy, from-1, to);
1071 break;
1072 default: /* "can't happen" */
1073 SETERROR(REG_ASSERT); /* just in case */
1074 break;
1075 }
1076 }
1077
1078 /*
1079 - seterr - set an error condition
1080 == static int seterr(register struct parse *p, int e);
1081 */
1082 static int /* useless but makes type checking happy */
1083 seterr(p, e)
1084 register struct parse *p;
1085 int e;
1086 {
1087 if (p->error == 0) /* keep earliest error condition */
1088 p->error = e;
1089 p->next = nuls; /* try to bring things to a halt */
1090 p->end = nuls;
1091 return(0); /* make the return value well-defined */
1092 }
1093
1094 /*
1095 - allocset - allocate a set of characters for []
1096 == static cset *allocset(register struct parse *p);
1097 */
1098 static cset *
1099 allocset(p)
1100 register struct parse *p;
1101 {
1102 register int no = p->g->ncsets++;
1103 register size_t nc;
1104 register size_t nbytes;
1105 register cset *cs;
1106 register size_t css = (size_t)p->g->csetsize;
1107 register int i;
1108
1109 if (no >= p->ncsalloc) { /* need another column of space */
1110 p->ncsalloc += CHAR_BIT;
1111 nc = p->ncsalloc;
1112 assert(nc % CHAR_BIT == 0);
1113 nbytes = nc / CHAR_BIT * css;
1114 if (p->g->sets == NULL)
1115 p->g->sets = (cset *)malloc(nc * sizeof(cset));
1116 else
1117 p->g->sets = (cset *)realloc((char *)p->g->sets,
1118 nc * sizeof(cset));
1119 if (p->g->setbits == NULL)
1120 p->g->setbits = (uch *)malloc(nbytes);
1121 else {
1122 p->g->setbits = (uch *)realloc((char *)p->g->setbits,
1123 nbytes);
1124 /* xxx this isn't right if setbits is now NULL */
1125 for (i = 0; i < no; i++)
1126 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1127 }
1128 if (p->g->sets != NULL && p->g->setbits != NULL)
1129 (void) memset((char *)p->g->setbits + (nbytes - css),
1130 0, css);
1131 else {
1132 no = 0;
1133 SETERROR(REG_ESPACE);
1134 /* caller's responsibility not to do set ops */
1135 }
1136 }
1137
1138 assert(p->g->sets != NULL); /* xxx */
1139 cs = &p->g->sets[no];
1140 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1141 cs->mask = 1 << ((no) % CHAR_BIT);
1142 cs->hash = 0;
1143 cs->smultis = 0;
1144 cs->multis = NULL;
1145
1146 return(cs);
1147 }
1148
1149 /*
1150 - freeset - free a now-unused set
1151 == static void freeset(register struct parse *p, register cset *cs);
1152 */
1153 static void
1154 freeset(p, cs)
1155 register struct parse *p;
1156 register cset *cs;
1157 {
1158 register int i;
1159 register cset *top = &p->g->sets[p->g->ncsets];
1160 register size_t css = (size_t)p->g->csetsize;
1161
1162 for (i = 0; i < css; i++)
1163 CHsub(cs, i);
1164 if (cs == top-1) /* recover only the easy case */
1165 p->g->ncsets--;
1166 }
1167
1168 /*
1169 - freezeset - final processing on a set of characters
1170 == static int freezeset(register struct parse *p, register cset *cs);
1171 *
1172 * The main task here is merging identical sets. This is usually a waste
1173 * of time (although the hash code minimizes the overhead), but can win
1174 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1175 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1176 * the same value!
1177 */
1178 static int /* set number */
1179 freezeset(p, cs)
1180 register struct parse *p;
1181 register cset *cs;
1182 {
1183 register uch h = cs->hash;
1184 register int i;
1185 register cset *top = &p->g->sets[p->g->ncsets];
1186 register cset *cs2;
1187 register size_t css = (size_t)p->g->csetsize;
1188
1189 /* look for an earlier one which is the same */
1190 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1191 if (cs2->hash == h && cs2 != cs) {
1192 /* maybe */
1193 for (i = 0; i < css; i++)
1194 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1195 break; /* no */
1196 if (i == css)
1197 break; /* yes */
1198 }
1199
1200 if (cs2 < top) { /* found one */
1201 freeset(p, cs);
1202 cs = cs2;
1203 }
1204
1205 return((int)(cs - p->g->sets));
1206 }
1207
1208 /*
1209 - firstch - return first character in a set (which must have at least one)
1210 == static int firstch(register struct parse *p, register cset *cs);
1211 */
1212 static int /* character; there is no "none" value */
1213 firstch(p, cs)
1214 register struct parse *p;
1215 register cset *cs;
1216 {
1217 register int i;
1218 register size_t css = (size_t)p->g->csetsize;
1219
1220 for (i = 0; i < css; i++)
1221 if (CHIN(cs, i))
1222 return((char)i);
1223 assert(never);
1224 return(0); /* arbitrary */
1225 }
1226
1227 /*
1228 - nch - number of characters in a set
1229 == static int nch(register struct parse *p, register cset *cs);
1230 */
1231 static int
1232 nch(p, cs)
1233 register struct parse *p;
1234 register cset *cs;
1235 {
1236 register int i;
1237 register size_t css = (size_t)p->g->csetsize;
1238 register int n = 0;
1239
1240 for (i = 0; i < css; i++)
1241 if (CHIN(cs, i))
1242 n++;
1243 return(n);
1244 }
1245
1246 /*
1247 - mcadd - add a collating element to a cset
1248 == static void mcadd(register struct parse *p, register cset *cs, \
1249 == register char *cp);
1250 */
1251 static void
1252 mcadd(p, cs, cp)
1253 register struct parse *p;
1254 register cset *cs;
1255 register char *cp;
1256 {
1257 register size_t oldend = cs->smultis;
1258
1259 cs->smultis += strlen(cp) + 1;
1260 if (cs->multis == NULL)
1261 cs->multis = malloc(cs->smultis);
1262 else
1263 cs->multis = realloc(cs->multis, cs->smultis);
1264 if (cs->multis == NULL) {
1265 SETERROR(REG_ESPACE);
1266 return;
1267 }
1268
1269 (void) strcpy(cs->multis + oldend - 1, cp);
1270 cs->multis[cs->smultis - 1] = '\0';
1271 }
1272
1273 /*
1274 - mcsub - subtract a collating element from a cset
1275 == static void mcsub(register cset *cs, register char *cp);
1276 */
1277 static void
1278 mcsub(cs, cp)
1279 register cset *cs;
1280 register char *cp;
1281 {
1282 register char *fp = mcfind(cs, cp);
1283 register size_t len = strlen(fp);
1284
1285 assert(fp != NULL);
1286 (void) memmove(fp, fp + len + 1,
1287 cs->smultis - (fp + len + 1 - cs->multis));
1288 cs->smultis -= len;
1289
1290 if (cs->smultis == 0) {
1291 free(cs->multis);
1292 cs->multis = NULL;
1293 return;
1294 }
1295
1296 cs->multis = realloc(cs->multis, cs->smultis);
1297 assert(cs->multis != NULL);
1298 }
1299
1300 /*
1301 - mcin - is a collating element in a cset?
1302 == static int mcin(register cset *cs, register char *cp);
1303 */
1304 static int
1305 mcin(cs, cp)
1306 register cset *cs;
1307 register char *cp;
1308 {
1309 return(mcfind(cs, cp) != NULL);
1310 }
1311
1312 /*
1313 - mcfind - find a collating element in a cset
1314 == static char *mcfind(register cset *cs, register char *cp);
1315 */
1316 static char *
1317 mcfind(cs, cp)
1318 register cset *cs;
1319 register char *cp;
1320 {
1321 register char *p;
1322
1323 if (cs->multis == NULL)
1324 return(NULL);
1325 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1326 if (strcmp(cp, p) == 0)
1327 return(p);
1328 return(NULL);
1329 }
1330
1331 /*
1332 - mcinvert - invert the list of collating elements in a cset
1333 == static void mcinvert(register struct parse *p, register cset *cs);
1334 *
1335 * This would have to know the set of possibilities. Implementation
1336 * is deferred.
1337 */
1338 static void
1339 mcinvert(p, cs)
1340 register struct parse *p;
1341 register cset *cs;
1342 {
1343 assert(cs->multis == NULL); /* xxx */
1344 }
1345
1346 /*
1347 - mccase - add case counterparts of the list of collating elements in a cset
1348 == static void mccase(register struct parse *p, register cset *cs);
1349 *
1350 * This would have to know the set of possibilities. Implementation
1351 * is deferred.
1352 */
1353 static void
1354 mccase(p, cs)
1355 register struct parse *p;
1356 register cset *cs;
1357 {
1358 assert(cs->multis == NULL); /* xxx */
1359 }
1360
1361 /*
1362 - isinsets - is this character in any sets?
1363 == static int isinsets(register struct re_guts *g, int c);
1364 */
1365 static int /* predicate */
1366 isinsets(g, c)
1367 register struct re_guts *g;
1368 int c;
1369 {
1370 register uch *col;
1371 register int i;
1372 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1373 register unsigned uc = (unsigned char)c;
1374
1375 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1376 if (col[uc] != 0)
1377 return(1);
1378 return(0);
1379 }
1380
1381 /*
1382 - samesets - are these two characters in exactly the same sets?
1383 == static int samesets(register struct re_guts *g, int c1, int c2);
1384 */
1385 static int /* predicate */
1386 samesets(g, c1, c2)
1387 register struct re_guts *g;
1388 int c1;
1389 int c2;
1390 {
1391 register uch *col;
1392 register int i;
1393 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1394 register unsigned uc1 = (unsigned char)c1;
1395 register unsigned uc2 = (unsigned char)c2;
1396
1397 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1398 if (col[uc1] != col[uc2])
1399 return(0);
1400 return(1);
1401 }
1402
1403 /*
1404 - categorize - sort out character categories
1405 == static void categorize(struct parse *p, register struct re_guts *g);
1406 */
1407 static void
1408 categorize(p, g)
1409 struct parse *p;
1410 register struct re_guts *g;
1411 {
1412 register cat_t *cats = g->categories;
1413 register int c;
1414 register int c2;
1415 register cat_t cat;
1416
1417 /* avoid making error situations worse */
1418 if (p->error != 0)
1419 return;
1420
1421 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1422 if (cats[c] == 0 && isinsets(g, c)) {
1423 cat = g->ncategories++;
1424 cats[c] = cat;
1425 for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1426 if (cats[c2] == 0 && samesets(g, c, c2))
1427 cats[c2] = cat;
1428 }
1429 }
1430
1431 /*
1432 - dupl - emit a duplicate of a bunch of sops
1433 == static sopno dupl(register struct parse *p, sopno start, sopno finish);
1434 */
1435 static sopno /* start of duplicate */
1436 dupl(p, start, finish)
1437 register struct parse *p;
1438 sopno start; /* from here */
1439 sopno finish; /* to this less one */
1440 {
1441 register sopno ret = HERE();
1442 register sopno len = finish - start;
1443
1444 assert(finish >= start);
1445 if (len == 0)
1446 return(ret);
1447 enlarge(p, p->ssize + len); /* this many unexpected additions */
1448 assert(p->ssize >= p->slen + len);
1449 (void) memcpy((char *)(p->strip + p->slen),
1450 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1451 p->slen += len;
1452 return(ret);
1453 }
1454
1455 /*
1456 - doemit - emit a strip operator
1457 == static void doemit(register struct parse *p, sop op, size_t opnd);
1458 *
1459 * It might seem better to implement this as a macro with a function as
1460 * hard-case backup, but it's just too big and messy unless there are
1461 * some changes to the data structures. Maybe later.
1462 */
1463 static void
1464 doemit(p, op, opnd)
1465 register struct parse *p;
1466 sop op;
1467 size_t opnd;
1468 {
1469 /* avoid making error situations worse */
1470 if (p->error != 0)
1471 return;
1472
1473 /* deal with oversize operands ("can't happen", more or less) */
1474 assert(opnd < 1<<OPSHIFT);
1475
1476 /* deal with undersized strip */
1477 if (p->slen >= p->ssize)
1478 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1479 assert(p->slen < p->ssize);
1480
1481 /* finally, it's all reduced to the easy case */
1482 p->strip[p->slen++] = SOP(op, opnd);
1483 }
1484
1485 /*
1486 - doinsert - insert a sop into the strip
1487 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
1488 */
1489 static void
1490 doinsert(p, op, opnd, pos)
1491 register struct parse *p;
1492 sop op;
1493 size_t opnd;
1494 sopno pos;
1495 {
1496 register sopno sn;
1497 register sop s;
1498 register int i;
1499
1500 /* avoid making error situations worse */
1501 if (p->error != 0)
1502 return;
1503
1504 sn = HERE();
1505 EMIT(op, opnd); /* do checks, ensure space */
1506 assert(HERE() == sn+1);
1507 s = p->strip[sn];
1508
1509 /* adjust paren pointers */
1510 assert(pos > 0);
1511 for (i = 1; i < NPAREN; i++) {
1512 if (p->pbegin[i] >= pos) {
1513 p->pbegin[i]++;
1514 }
1515 if (p->pend[i] >= pos) {
1516 p->pend[i]++;
1517 }
1518 }
1519
1520 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1521 (HERE()-pos-1)*sizeof(sop));
1522 p->strip[pos] = s;
1523 }
1524
1525 /*
1526 - dofwd - complete a forward reference
1527 == static void dofwd(register struct parse *p, sopno pos, sop value);
1528 */
1529 static void
1530 dofwd(p, pos, value)
1531 register struct parse *p;
1532 register sopno pos;
1533 sop value;
1534 {
1535 /* avoid making error situations worse */
1536 if (p->error != 0)
1537 return;
1538
1539 assert(value < 1<<OPSHIFT);
1540 p->strip[pos] = OP(p->strip[pos]) | value;
1541 }
1542
1543 /*
1544 - enlarge - enlarge the strip
1545 == static void enlarge(register struct parse *p, sopno size);
1546 */
1547 static void
1548 enlarge(p, size)
1549 register struct parse *p;
1550 register sopno size;
1551 {
1552 register sop *sp;
1553
1554 if (p->ssize >= size)
1555 return;
1556
1557 sp = (sop *)realloc(p->strip, size*sizeof(sop));
1558 if (sp == NULL) {
1559 SETERROR(REG_ESPACE);
1560 return;
1561 }
1562 p->strip = sp;
1563 p->ssize = size;
1564 }
1565
1566 /*
1567 - stripsnug - compact the strip
1568 == static void stripsnug(register struct parse *p, register struct re_guts *g);
1569 */
1570 static void
1571 stripsnug(p, g)
1572 register struct parse *p;
1573 register struct re_guts *g;
1574 {
1575 g->nstates = p->slen;
1576 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1577 if (g->strip == NULL) {
1578 SETERROR(REG_ESPACE);
1579 g->strip = p->strip;
1580 }
1581 }
1582
1583 /*
1584 - findmust - fill in must and mlen with longest mandatory literal string
1585 == static void findmust(register struct parse *p, register struct re_guts *g);
1586 *
1587 * This algorithm could do fancy things like analyzing the operands of |
1588 * for common subsequences. Someday. This code is simple and finds most
1589 * of the interesting cases.
1590 *
1591 * Note that must and mlen got initialized during setup.
1592 */
1593 static void
1594 findmust(p, g)
1595 struct parse *p;
1596 register struct re_guts *g;
1597 {
1598 register sop *scan;
1599 sop *start;
1600 register sop *newstart;
1601 register sopno newlen;
1602 register sop s;
1603 register char *cp;
1604 register sopno i;
1605
1606 /* avoid making error situations worse */
1607 if (p->error != 0)
1608 return;
1609
1610 /* find the longest OCHAR sequence in strip */
1611 newlen = 0;
1612 scan = g->strip + 1;
1613 do {
1614 s = *scan++;
1615 switch (OP(s)) {
1616 case OCHAR: /* sequence member */
1617 if (newlen == 0) /* new sequence */
1618 newstart = scan - 1;
1619 newlen++;
1620 break;
1621 case OPLUS_: /* things that don't break one */
1622 case OLPAREN:
1623 case ORPAREN:
1624 break;
1625 case OQUEST_: /* things that must be skipped */
1626 case OCH_:
1627 scan--;
1628 do {
1629 scan += OPND(s);
1630 s = *scan;
1631 /* assert() interferes w debug printouts */
1632 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1633 OP(s) != OOR2) {
1634 g->iflags |= BAD;
1635 return;
1636 }
1637 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1638 /* fallthrough */
1639 default: /* things that break a sequence */
1640 if (newlen > g->mlen) { /* ends one */
1641 start = newstart;
1642 g->mlen = newlen;
1643 }
1644 newlen = 0;
1645 break;
1646 }
1647 } while (OP(s) != OEND);
1648
1649 if (g->mlen == 0) /* there isn't one */
1650 return;
1651
1652 /* turn it into a character string */
1653 g->must = malloc((size_t)g->mlen + 1);
1654 if (g->must == NULL) { /* argh; just forget it */
1655 g->mlen = 0;
1656 return;
1657 }
1658 cp = g->must;
1659 scan = start;
1660 for (i = g->mlen; i > 0; i--) {
1661 while (OP(s = *scan++) != OCHAR)
1662 continue;
1663 assert(cp < g->must + g->mlen);
1664 *cp++ = (char)OPND(s);
1665 }
1666 assert(cp == g->must + g->mlen);
1667 *cp++ = '\0'; /* just on general principles */
1668 }
1669
1670 /*
1671 - pluscount - count + nesting
1672 == static sopno pluscount(register struct parse *p, register struct re_guts *g);
1673 */
1674 static sopno /* nesting depth */
1675 pluscount(p, g)
1676 struct parse *p;
1677 register struct re_guts *g;
1678 {
1679 register sop *scan;
1680 register sop s;
1681 register sopno plusnest = 0;
1682 register sopno maxnest = 0;
1683
1684 if (p->error != 0)
1685 return(0); /* there may not be an OEND */
1686
1687 scan = g->strip + 1;
1688 do {
1689 s = *scan++;
1690 switch (OP(s)) {
1691 case OPLUS_:
1692 plusnest++;
1693 break;
1694 case O_PLUS:
1695 if (plusnest > maxnest)
1696 maxnest = plusnest;
1697 plusnest--;
1698 break;
1699 }
1700 } while (OP(s) != OEND);
1701 if (plusnest != 0)
1702 g->iflags |= BAD;
1703 return(maxnest);
1704 }
1705