compile.c revision 1.10 1 /*-
2 * Copyright (c) 1992 Diomidis Spinellis.
3 * Copyright (c) 1992, 1993
4 * The Regents of the University of California. All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * Diomidis Spinellis of Imperial College, University of London.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the University of
20 * California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 */
37
#ifndef lint
/* from: static char sccsid[] = "@(#)compile.c 8.1 (Berkeley) 6/6/93"; */
/* Revision identification string; left out when `lint' is defined. */
static char *rcsid = "$Id: compile.c,v 1.10 1994/02/03 23:44:46 cgd Exp $";
#endif /* not lint */
42
43 #include <sys/types.h>
44 #include <sys/stat.h>
45
46 #include <ctype.h>
47 #include <errno.h>
48 #include <fcntl.h>
49 #include <limits.h>
50 #include <regex.h>
51 #include <stdio.h>
52 #include <stdlib.h>
53 #include <string.h>
54
55 #include "defs.h"
56 #include "extern.h"
57
/*
 * Label hash table.  Entries are added by enterlabel(), looked up by
 * findlabel() and released by uselabel().
 */
#define LHSZ 128		/* Number of hash buckets */
#define LHMASK (LHSZ - 1)	/* Reduces a hash value to a bucket index */
static struct labhash {
	struct labhash *lh_next;	/* Next entry in the same bucket */
	u_int lh_hash;			/* Full (unmasked) hash of the label name */
	struct s_command *lh_cmd;	/* Command that defines the label */
	int lh_ref;			/* Set to 1 once findlabel() has returned it */
} *labels[LHSZ];
66
/*
 * Forward declarations for the file-local helpers defined below.  The
 * parameter lists are wrapped in __P(), which is not defined in this file
 * (presumably the BSD macro that hides prototypes from pre-ANSI
 * compilers -- confirm against the included headers).
 */
static char *compile_addr __P((char *, struct s_addr *));
static char *compile_delimited __P((char *, char *));
static char *compile_flags __P((char *, struct s_subst *));
static char *compile_re __P((char *, regex_t **));
static char *compile_subst __P((char *, struct s_subst *));
static char *compile_text __P((void));
static char *compile_tr __P((char *, char **));
static struct s_command
	**compile_stream __P((char *, struct s_command **, char *));
static char *duptoeol __P((char *, char *));
static void enterlabel __P((struct s_command *));
static struct s_command
	*findlabel __P((char *));
static void fixuplabel __P((struct s_command *, struct s_command *));
static void uselabel __P((void));
82
/*
 * Command specification. This is used to drive the command parser:
 * compile_stream() looks a command character up in cmd_fmts[] and uses the
 * matching entry to check the address count and pick the argument parser.
 */
struct s_format {
	char code; /* Command code */
	int naddr; /* Largest number of addresses the command accepts */
	enum e_args args; /* Argument type */
};
91
/*
 * Table of the recognised commands.  compile_stream() scans it linearly and
 * stops at the first entry whose code matches, or at the final entry whose
 * code is '\0', which therefore acts as the end-of-table marker.
 */
static struct s_format cmd_fmts[] = {
	{'{', 2, GROUP},
	{'a', 1, TEXT},
	{'b', 2, BRANCH},
	{'c', 2, TEXT},
	{'d', 2, EMPTY},
	{'D', 2, EMPTY},
	{'g', 2, EMPTY},
	{'G', 2, EMPTY},
	{'h', 2, EMPTY},
	{'H', 2, EMPTY},
	{'i', 1, TEXT},
	{'l', 2, EMPTY},
	{'n', 2, EMPTY},
	{'N', 2, EMPTY},
	{'p', 2, EMPTY},
	{'P', 2, EMPTY},
	{'q', 1, EMPTY},
	{'r', 1, RFILE},
	{'s', 2, SUBST},
	{'t', 2, BRANCH},
	{'w', 2, WFILE},
	{'x', 2, EMPTY},
	{'y', 2, TR},
	{'!', 2, NONSEL},
	{':', 0, LABEL},
	{'#', 0, COMMENT},
	{'=', 1, EMPTY},
	{'\0', 0, COMMENT},	/* End-of-table marker */
};
122
/* The compiled program: head of the command list built by compile(). */
struct s_command *prog;
125
126 /*
127 * Compile the program into prog.
128 * Initialise appends.
129 */
130 void
131 compile()
132 {
133 *compile_stream(NULL, &prog, NULL) = NULL;
134 fixuplabel(prog, NULL);
135 uselabel();
136 appends = xmalloc(sizeof(struct s_appends) * appendnum);
137 match = xmalloc((maxnsub + 1) * sizeof(regmatch_t));
138 }
139
/*
 * Advance the local variable p past any run of ASCII whitespace.  A NULL p
 * is left untouched.  The isascii() test keeps isspace() from being handed
 * a character outside the range it is defined for.
 */
#define EATSPACE() do {							\
	while (p != NULL && *p != '\0' && isascii(*p) && isspace(*p))	\
		p++;							\
} while (0)
145
146 static struct s_command **
147 compile_stream(terminator, link, p)
148 char *terminator;
149 struct s_command **link;
150 register char *p;
151 {
152 static char lbuf[_POSIX2_LINE_MAX + 1]; /* To save stack */
153 struct s_command *cmd, *cmd2;
154 struct s_format *fp;
155 int naddr; /* Number of addresses */
156
157 if (p != NULL)
158 goto semicolon;
159 for (;;) {
160 if ((p = cu_fgets(lbuf, sizeof(lbuf))) == NULL) {
161 if (terminator != NULL)
162 err(COMPILE, "unexpected EOF (pending }'s)");
163 return (link);
164 }
165
166 semicolon: EATSPACE();
167 if (p && (*p == '#' || *p == '\0'))
168 continue;
169 if (*p == '}') {
170 if (terminator == NULL)
171 err(COMPILE, "unexpected }");
172 return (link);
173 }
174 *link = cmd = xmalloc(sizeof(struct s_command));
175 link = &cmd->next;
176 cmd->nonsel = cmd->inrange = 0;
177 /* First parse the addresses */
178 naddr = 0;
179 cmd->a1 = cmd->a2 = NULL;
180
181 /* Valid characters to start an address */
182 #define addrchar(c) (strchr("0123456789/\\$", (c)))
183 if (addrchar(*p)) {
184 naddr++;
185 cmd->a1 = xmalloc(sizeof(struct s_addr));
186 p = compile_addr(p, cmd->a1);
187 EATSPACE(); /* EXTENSION */
188 if (*p == ',') {
189 naddr++;
190 p++;
191 EATSPACE(); /* EXTENSION */
192 cmd->a2 = xmalloc(sizeof(struct s_addr));
193 p = compile_addr(p, cmd->a2);
194 }
195 }
196
197 nonsel: /* Now parse the command */
198 EATSPACE();
199 if (!*p)
200 err(COMPILE, "command expected");
201 cmd->code = *p;
202 for (fp = cmd_fmts; fp->code; fp++)
203 if (fp->code == *p)
204 break;
205 if (!fp->code)
206 err(COMPILE, "invalid command code %c", *p);
207 if (naddr > fp->naddr)
208 err(COMPILE,
209 "command %c expects up to %d address(es), found %d", *p, fp->naddr, naddr);
210 switch (fp->args) {
211 case NONSEL: /* ! */
212 cmd->nonsel = ! cmd->nonsel;
213 p++;
214 goto nonsel;
215 case GROUP: /* { */
216 p++;
217 EATSPACE();
218 if (!*p)
219 p = NULL;
220 cmd2 = xmalloc(sizeof(struct s_command));
221 cmd2->code = '}';
222 *compile_stream("}", &cmd->u.c, p) = cmd2;
223 cmd->next = cmd2;
224 link = &cmd2->next;
225 break;
226 case EMPTY: /* d D g G h H l n N p P q x = \0 */
227 p++;
228 EATSPACE();
229 if (*p == ';') {
230 p++;
231 link = &cmd->next;
232 goto semicolon;
233 }
234 if (*p)
235 err(COMPILE,
236 "extra characters at the end of %c command", cmd->code);
237 break;
238 case TEXT: /* a c i */
239 p++;
240 EATSPACE();
241 if (*p != '\\')
242 err(COMPILE,
243 "command %c expects \\ followed by text", cmd->code);
244 p++;
245 EATSPACE();
246 if (*p)
247 err(COMPILE,
248 "extra characters after \\ at the end of %c command", cmd->code);
249 cmd->t = compile_text();
250 break;
251 case COMMENT: /* \0 # */
252 break;
253 case WFILE: /* w */
254 p++;
255 EATSPACE();
256 if (*p == '\0')
257 err(COMPILE, "filename expected");
258 cmd->t = duptoeol(p, "w command");
259 if (aflag)
260 cmd->u.fd = -1;
261 else if ((cmd->u.fd = open(p,
262 O_WRONLY|O_APPEND|O_CREAT|O_TRUNC,
263 DEFFILEMODE)) == -1)
264 err(FATAL, "%s: %s\n", p, strerror(errno));
265 break;
266 case RFILE: /* r */
267 p++;
268 EATSPACE();
269 if (*p == '\0')
270 err(COMPILE, "filename expected");
271 else
272 cmd->t = duptoeol(p, "read command");
273 break;
274 case BRANCH: /* b t */
275 p++;
276 EATSPACE();
277 if (*p == '\0')
278 cmd->t = NULL;
279 else
280 cmd->t = duptoeol(p, "branch");
281 break;
282 case LABEL: /* : */
283 p++;
284 EATSPACE();
285 cmd->t = duptoeol(p, "label");
286 if (strlen(p) == 0)
287 err(COMPILE, "empty label");
288 enterlabel(cmd);
289 break;
290 case SUBST: /* s */
291 p++;
292 if (*p == '\0' || *p == '\\')
293 err(COMPILE,
294 "substitute pattern can not be delimited by newline or backslash");
295 cmd->u.s = xmalloc(sizeof(struct s_subst));
296 p = compile_re(p, &cmd->u.s->re);
297 if (p == NULL)
298 err(COMPILE, "unterminated substitute pattern");
299 --p;
300 p = compile_subst(p, cmd->u.s);
301 p = compile_flags(p, cmd->u.s);
302 EATSPACE();
303 if (*p == ';') {
304 p++;
305 link = &cmd->next;
306 goto semicolon;
307 }
308 break;
309 case TR: /* y */
310 p++;
311 p = compile_tr(p, (char **)&cmd->u.y);
312 EATSPACE();
313 if (*p == ';') {
314 p++;
315 link = &cmd->next;
316 goto semicolon;
317 }
318 if (*p)
319 err(COMPILE,
320 "extra text at the end of a transform command");
321 break;
322 }
323 }
324 }
325
326 /*
327 * Get a delimited string. P points to the delimeter of the string; d points
328 * to a buffer area. Newline and delimiter escapes are processed; other
329 * escapes are ignored.
330 *
331 * Returns a pointer to the first character after the final delimiter or NULL
332 * in the case of a non-terminated string. The character array d is filled
333 * with the processed string.
334 */
335 static char *
336 compile_delimited(p, d)
337 char *p, *d;
338 {
339 char c;
340
341 c = *p++;
342 if (c == '\0')
343 return (NULL);
344 else if (c == '\\')
345 err(COMPILE, "\\ can not be used as a string delimiter");
346 else if (c == '\n')
347 err(COMPILE, "newline can not be used as a string delimiter");
348 while (*p) {
349 if (*p == '\\' && p[1] == c)
350 p++;
351 else if (*p == '\\' && p[1] == 'n') {
352 *d++ = '\n';
353 p += 2;
354 continue;
355 } else if (*p == '\\' && p[1] == '\\')
356 *d++ = *p++;
357 else if (*p == c) {
358 *d = '\0';
359 return (p + 1);
360 }
361 *d++ = *p++;
362 }
363 return (NULL);
364 }
365
366 /*
367 * Get a regular expression. P points to the delimiter of the regular
368 * expression; repp points to the address of a regexp pointer. Newline
369 * and delimiter escapes are processed; other escapes are ignored.
370 * Returns a pointer to the first character after the final delimiter
371 * or NULL in the case of a non terminated regular expression. The regexp
372 * pointer is set to the compiled regular expression.
373 * Cflags are passed to regcomp.
374 */
375 static char *
376 compile_re(p, repp)
377 char *p;
378 regex_t **repp;
379 {
380 int eval;
381 char re[_POSIX2_LINE_MAX + 1];
382
383 p = compile_delimited(p, re);
384 if (p && strlen(re) == 0) {
385 *repp = NULL;
386 return (p);
387 }
388 *repp = xmalloc(sizeof(regex_t));
389 if (p && (eval = regcomp(*repp, re, 0)) != 0)
390 err(COMPILE, "RE error: %s", strregerror(eval, *repp));
391 if (maxnsub < (*repp)->re_nsub)
392 maxnsub = (*repp)->re_nsub;
393 return (p);
394 }
395
396 /*
397 * Compile the substitution string of a regular expression and set res to
398 * point to a saved copy of it. Nsub is the number of parenthesized regular
399 * expressions.
400 */
401 static char *
402 compile_subst(p, s)
403 char *p;
404 struct s_subst *s;
405 {
406 static char lbuf[_POSIX2_LINE_MAX + 1];
407 int asize, ref, size;
408 char c, *text, *op, *sp;
409
410 c = *p++; /* Terminator character */
411 if (c == '\0')
412 return (NULL);
413
414 s->maxbref = 0;
415 s->linenum = linenum;
416 asize = 2 * _POSIX2_LINE_MAX + 1;
417 text = xmalloc(asize);
418 size = 0;
419 do {
420 op = sp = text + size;
421 for (; *p; p++) {
422 if (*p == '\\') {
423 p++;
424 if (strchr("123456789", *p) != NULL) {
425 *sp++ = '\\';
426 ref = *p - '0';
427 if (s->re != NULL &&
428 ref > s->re->re_nsub)
429 err(COMPILE,
430 "\\%c not defined in the RE", *p);
431 if (s->maxbref < ref)
432 s->maxbref = ref;
433 } else if (*p == '&' || *p == '\\')
434 *sp++ = '\\';
435 } else if (*p == c) {
436 p++;
437 *sp++ = '\0';
438 size += sp - op;
439 s->new = xrealloc(text, size);
440 return (p);
441 } else if (*p == '\n') {
442 err(COMPILE,
443 "unescaped newline inside substitute pattern");
444 /* NOTREACHED */
445 }
446 *sp++ = *p;
447 }
448 size += sp - op;
449 if (asize - size < _POSIX2_LINE_MAX + 1) {
450 asize *= 2;
451 text = xmalloc(asize);
452 }
453 } while (cu_fgets(p = lbuf, sizeof(lbuf)));
454 err(COMPILE, "unterminated substitute in regular expression");
455 /* NOTREACHED */
456 }
457
458 /*
459 * Compile the flags of the s command
460 */
461 static char *
462 compile_flags(p, s)
463 char *p;
464 struct s_subst *s;
465 {
466 int gn; /* True if we have seen g or n */
467 char wfile[_POSIX2_LINE_MAX + 1], *q;
468
469 s->n = 1; /* Default */
470 s->p = 0;
471 s->wfile = NULL;
472 s->wfd = -1;
473 for (gn = 0;;) {
474 EATSPACE(); /* EXTENSION */
475 switch (*p) {
476 case 'g':
477 if (gn)
478 err(COMPILE,
479 "more than one number or 'g' in substitute flags");
480 gn = 1;
481 s->n = 0;
482 break;
483 case '\0':
484 case '\n':
485 case ';':
486 return (p);
487 case 'p':
488 s->p = 1;
489 break;
490 case '1': case '2': case '3':
491 case '4': case '5': case '6':
492 case '7': case '8': case '9':
493 if (gn)
494 err(COMPILE,
495 "more than one number or 'g' in substitute flags");
496 gn = 1;
497 /* XXX Check for overflow */
498 s->n = (int)strtol(p, &p, 10);
499 break;
500 case 'w':
501 p++;
502 #ifdef HISTORIC_PRACTICE
503 if (*p != ' ') {
504 err(WARNING, "space missing before w wfile");
505 return (p);
506 }
507 #endif
508 EATSPACE();
509 q = wfile;
510 while (*p) {
511 if (*p == '\n')
512 break;
513 *q++ = *p++;
514 }
515 *q = '\0';
516 if (q == wfile)
517 err(COMPILE, "no wfile specified");
518 s->wfile = strdup(wfile);
519 if (!aflag && (s->wfd = open(wfile,
520 O_WRONLY|O_APPEND|O_CREAT|O_TRUNC,
521 DEFFILEMODE)) == -1)
522 err(FATAL, "%s: %s\n", wfile, strerror(errno));
523 return (p);
524 default:
525 err(COMPILE,
526 "bad flag in substitute command: '%c'", *p);
527 break;
528 }
529 p++;
530 }
531 }
532
533 /*
534 * Compile a translation set of strings into a lookup table.
535 */
536 static char *
537 compile_tr(p, transtab)
538 char *p;
539 char **transtab;
540 {
541 int i;
542 char *lt, *op, *np;
543 char old[_POSIX2_LINE_MAX + 1];
544 char new[_POSIX2_LINE_MAX + 1];
545
546 if (*p == '\0' || *p == '\\')
547 err(COMPILE,
548 "transform pattern can not be delimited by newline or backslash");
549 p = compile_delimited(p, old);
550 if (p == NULL) {
551 err(COMPILE, "unterminated transform source string");
552 return (NULL);
553 }
554 p = compile_delimited(--p, new);
555 if (p == NULL) {
556 err(COMPILE, "unterminated transform target string");
557 return (NULL);
558 }
559 EATSPACE();
560 if (strlen(new) != strlen(old)) {
561 err(COMPILE, "transform strings are not the same length");
562 return (NULL);
563 }
564 /* We assume characters are 8 bits */
565 lt = xmalloc(UCHAR_MAX);
566 for (i = 0; i <= UCHAR_MAX; i++)
567 lt[i] = (char)i;
568 for (op = old, np = new; *op; op++, np++)
569 lt[(u_char)*op] = *np;
570 *transtab = lt;
571 return (p);
572 }
573
574 /*
575 * Compile the text following an a or i command.
576 */
577 static char *
578 compile_text()
579 {
580 int asize, size;
581 char *text, *p, *op, *s;
582 char lbuf[_POSIX2_LINE_MAX + 1];
583
584 asize = 2 * _POSIX2_LINE_MAX + 1;
585 text = xmalloc(asize);
586 size = 0;
587 while (cu_fgets(lbuf, sizeof(lbuf))) {
588 op = s = text + size;
589 p = lbuf;
590 EATSPACE();
591 for (; *p; p++) {
592 if (*p == '\\')
593 p++;
594 *s++ = *p;
595 }
596 size += s - op;
597 if (p[-2] != '\\') {
598 *s = '\0';
599 break;
600 }
601 if (asize - size < _POSIX2_LINE_MAX + 1) {
602 asize *= 2;
603 text = xmalloc(asize);
604 }
605 }
606 return (xrealloc(text, size + 1));
607 }
608
609 /*
610 * Get an address and return a pointer to the first character after
611 * it. Fill the structure pointed to according to the address.
612 */
613 static char *
614 compile_addr(p, a)
615 char *p;
616 struct s_addr *a;
617 {
618 char *end;
619
620 switch (*p) {
621 case '\\': /* Context address */
622 ++p;
623 /* FALLTHROUGH */
624 case '/': /* Context address */
625 p = compile_re(p, &a->u.r);
626 if (p == NULL)
627 err(COMPILE, "unterminated regular expression");
628 a->type = AT_RE;
629 return (p);
630
631 case '$': /* Last line */
632 a->type = AT_LAST;
633 return (p + 1);
634 /* Line number */
635 case '0': case '1': case '2': case '3': case '4':
636 case '5': case '6': case '7': case '8': case '9':
637 a->type = AT_LINE;
638 a->u.l = strtol(p, &end, 10);
639 return (end);
640 default:
641 err(COMPILE, "expected context address");
642 return (NULL);
643 }
644 }
645
646 /*
647 * duptoeol --
648 * Return a copy of all the characters up to \n or \0.
649 */
650 static char *
651 duptoeol(s, ctype)
652 register char *s;
653 char *ctype;
654 {
655 size_t len;
656 int ws;
657 char *start;
658
659 ws = 0;
660 for (start = s; *s != '\0' && *s != '\n'; ++s)
661 ws = isspace(*s);
662 *s = '\0';
663 if (ws)
664 err(WARNING, "whitespace after %s", ctype);
665 len = s - start + 1;
666 return (memmove(xmalloc(len), start, len));
667 }
668
669 /*
670 * Convert goto label names to addresses, and count a and r commands, in
671 * the given subset of the script. Free the memory used by labels in b
672 * and t commands (but not by :).
673 *
674 * TODO: Remove } nodes
675 */
676 static void
677 fixuplabel(cp, end)
678 struct s_command *cp, *end;
679 {
680
681 for (; cp != end; cp = cp->next)
682 switch (cp->code) {
683 case 'a':
684 case 'r':
685 appendnum++;
686 break;
687 case 'b':
688 case 't':
689 /* Resolve branch target. */
690 if (cp->t == NULL) {
691 cp->u.c = NULL;
692 break;
693 }
694 if ((cp->u.c = findlabel(cp->t)) == NULL)
695 err(COMPILE2, "undefined label '%s'", cp->t);
696 free(cp->t);
697 break;
698 case '{':
699 /* Do interior commands. */
700 fixuplabel(cp->u.c, cp->next);
701 break;
702 }
703 }
704
705 /*
706 * Associate the given command label for later lookup.
707 */
708 static void
709 enterlabel(cp)
710 struct s_command *cp;
711 {
712 register struct labhash **lhp, *lh;
713 register u_char *p;
714 register u_int h, c;
715
716 for (h = 0, p = (u_char *)cp->t; (c = *p) != 0; p++)
717 h = (h << 5) + h + c;
718 lhp = &labels[h & LHMASK];
719 for (lh = *lhp; lh != NULL; lh = lh->lh_next)
720 if (lh->lh_hash == h && strcmp(cp->t, lh->lh_cmd->t) == 0)
721 err(COMPILE2, "duplicate label '%s'", cp->t);
722 lh = xmalloc(sizeof *lh);
723 lh->lh_next = *lhp;
724 lh->lh_hash = h;
725 lh->lh_cmd = cp;
726 lh->lh_ref = 0;
727 *lhp = lh;
728 }
729
730 /*
731 * Find the label contained in the command l in the command linked
732 * list cp. L is excluded from the search. Return NULL if not found.
733 */
734 static struct s_command *
735 findlabel(name)
736 char *name;
737 {
738 register struct labhash *lh;
739 register u_char *p;
740 register u_int h, c;
741
742 for (h = 0, p = (u_char *)name; (c = *p) != 0; p++)
743 h = (h << 5) + h + c;
744 for (lh = labels[h & LHMASK]; lh != NULL; lh = lh->lh_next) {
745 if (lh->lh_hash == h && strcmp(name, lh->lh_cmd->t) == 0) {
746 lh->lh_ref = 1;
747 return (lh->lh_cmd);
748 }
749 }
750 return (NULL);
751 }
752
753 /*
754 * Warn about any unused labels. As a side effect, release the label hash
755 * table space.
756 */
757 static void
758 uselabel()
759 {
760 register struct labhash *lh, *next;
761 register int i;
762
763 for (i = 0; i < LHSZ; i++) {
764 for (lh = labels[i]; lh != NULL; lh = next) {
765 next = lh->lh_next;
766 if (!lh->lh_ref)
767 err(WARNING, "unused label '%s'",
768 lh->lh_cmd->t);
769 free(lh);
770 }
771 }
772 }
773