regexp.c revision 1.12 1 /* $NetBSD: regexp.c,v 1.12 2012/03/20 20:34:59 matt Exp $ */
2
3 /*
4 * Copyright (c) 1980, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 */
32
33 #include <sys/cdefs.h>
34 #ifndef lint
35 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\
36 The Regents of the University of California. All rights reserved.");
37 #endif /* not lint */
38
39 #ifndef lint
40 #if 0
41 static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93";
42 #endif
43 __RCSID("$NetBSD: regexp.c,v 1.12 2012/03/20 20:34:59 matt Exp $");
44 #endif /* not lint */
45
46 #include <assert.h>
47 #include <ctype.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include "extern.h"
51
52 #define FALSE 0
53 #define TRUE !(FALSE)
54 #define NIL 0
55
56 static void expconv __P((void));
57
58 boolean x_escaped; /* true if we are currently x_escaped */
59 char *x_start; /* start of string */
60 boolean l_onecase; /* true if upper and lower equivalent */
61
62 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))
63
64 /* STRNCMP - like strncmp except that we convert the
65 * first string to lower case before comparing
66 * if l_onecase is set.
67 */
68
69 int
70 STRNCMP(char *s1, char *s2, int len)
71 {
72 if (l_onecase) {
73 do
74 if (*s2 - makelower(*s1))
75 return (*s2 - makelower(*s1));
76 else {
77 s2++;
78 s1++;
79 }
80 while (--len);
81 } else {
82 do
83 if (*s2 - *s1)
84 return (*s2 - *s1);
85 else {
86 s2++;
87 s1++;
88 }
89 while (--len);
90 }
91 return(0);
92 }
93
94 /* The following routine converts an irregular expression to
95 * internal format.
96 *
97 * Either meta symbols (\a \d or \p) or character strings or
98 * operations ( alternation or perenthesizing ) can be
99 * specified. Each starts with a descriptor byte. The descriptor
100 * byte has STR set for strings, META set for meta symbols
101 * and OPER set for operations.
102 * The descriptor byte can also have the OPT bit set if the object
103 * defined is optional. Also ALT can be set to indicate an alternation.
104 *
105 * For metasymbols the byte following the descriptor byte identities
106 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
107 * strings the byte after the descriptor is a character count for
108 * the string:
109 *
110 * meta symbols := descriptor
111 * symbol
112 *
113 * strings := descriptor
114 * character count
115 * the string
116 *
117 * operatins := descriptor
118 * symbol
119 * character count
120 */
121
122 /*
123 * handy macros for accessing parts of match blocks
124 */
125 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
126 #define MNEXT(A) (A+2) /* character following a metasymbol block */
127
128 #define OSYM(A) (*(A+1)) /* symbol in an operation block */
129 #define OCNT(A) (*(A+2)) /* character count */
130 #define ONEXT(A) (A+3) /* next character after the operation */
131 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
132
133 #define SCNT(A) (*(A+1)) /* byte count of a string */
134 #define SSTR(A) (A+2) /* address of the string */
135 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
136
137 /*
138 * bit flags in the descriptor
139 */
140 #define OPT 1
141 #define STR 2
142 #define META 4
143 #define ALT 8
144 #define OPER 16
145
146 static char *ccre; /* pointer to current position in converted exp*/
147 static char *ure; /* pointer current position in unconverted exp */
148
149 char *
150 convexp(char *re) /* unconverted irregular expression */
151 {
152 char *cre; /* pointer to converted regular expression */
153
154 /* allocate room for the converted expression */
155 if (re == NIL)
156 return (NIL);
157 if (*re == '\0')
158 return (NIL);
159 cre = malloc(4 * strlen(re) + 3);
160 ccre = cre;
161 ure = re;
162
163 /* start the conversion with a \a */
164 *cre = META | OPT;
165 MSYM(cre) = 'a';
166 ccre = MNEXT(cre);
167
168 /* start the conversion (its recursive) */
169 expconv();
170 *ccre = 0;
171 return (cre);
172 }
173
174 static void
175 expconv(void)
176 {
177 char *cs; /* pointer to current symbol in converted exp */
178 char c; /* character being processed */
179 char *acs; /* pinter to last alternate */
180 int temp;
181
182 /* let the conversion begin */
183 acs = NIL;
184 cs = NIL;
185 while (*ure != NIL) {
186 switch (c = *ure++) {
187
188 case '\\':
189 switch (c = *ure++) {
190
191 /* escaped characters are just characters */
192 default:
193 if (cs == NIL || (*cs & STR) == 0) {
194 cs = ccre;
195 *cs = STR;
196 SCNT(cs) = 1;
197 ccre += 2;
198 } else
199 SCNT(cs)++;
200 *ccre++ = c;
201 break;
202
203 /* normal(?) metacharacters */
204 case 'a':
205 case 'd':
206 case 'e':
207 case 'p':
208 if (acs != NIL && acs != cs) {
209 do {
210 temp = OCNT(acs);
211 OCNT(acs) = ccre - acs;
212 acs -= temp;
213 } while (temp != 0);
214 acs = NIL;
215 }
216 cs = ccre;
217 *cs = META;
218 MSYM(cs) = c;
219 ccre = MNEXT(cs);
220 break;
221 }
222 break;
223
224 /* just put the symbol in */
225 case '^':
226 case '$':
227 if (acs != NIL && acs != cs) {
228 do {
229 temp = OCNT(acs);
230 OCNT(acs) = ccre - acs;
231 acs -= temp;
232 } while (temp != 0);
233 acs = NIL;
234 }
235 cs = ccre;
236 *cs = META;
237 MSYM(cs) = c;
238 ccre = MNEXT(cs);
239 break;
240
241 /* mark the last match sequence as optional */
242 case '?':
243 if (cs)
244 *cs = *cs | OPT;
245 break;
246
247 /* recurse and define a subexpression */
248 case '(':
249 if (acs != NIL && acs != cs) {
250 do {
251 temp = OCNT(acs);
252 OCNT(acs) = ccre - acs;
253 acs -= temp;
254 } while (temp != 0);
255 acs = NIL;
256 }
257 cs = ccre;
258 *cs = OPER;
259 OSYM(cs) = '(';
260 ccre = ONEXT(cs);
261 expconv();
262 OCNT(cs) = ccre - cs; /* offset to next symbol */
263 break;
264
265 /* reurn from a recursion */
266 case ')':
267 if (acs != NIL) {
268 do {
269 temp = OCNT(acs);
270 OCNT(acs) = ccre - acs;
271 acs -= temp;
272 } while (temp != 0);
273 acs = NIL;
274 }
275 cs = ccre;
276 *cs = META;
277 MSYM(cs) = c;
278 ccre = MNEXT(cs);
279 return;
280
281 /* mark the last match sequence as having an alternate */
282 /* the third byte will contain an offset to jump over the */
283 /* alternate match in case the first did not fail */
284 case '|':
285 if (acs != NIL && acs != cs)
286 OCNT(ccre) = ccre - acs; /* make a back pointer */
287 else
288 OCNT(ccre) = 0;
289 assert(cs != NULL);
290 *cs |= ALT;
291 cs = ccre;
292 *cs = OPER;
293 OSYM(cs) = '|';
294 ccre = ONEXT(cs);
295 acs = cs; /* remember that the pointer is to be filles */
296 break;
297
298 /* if its not a metasymbol just build a scharacter string */
299 default:
300 if (cs == NIL || (*cs & STR) == 0) {
301 cs = ccre;
302 *cs = STR;
303 SCNT(cs) = 1;
304 ccre = SSTR(cs);
305 } else
306 SCNT(cs)++;
307 *ccre++ = c;
308 break;
309 }
310 }
311 if (acs != NIL) {
312 do {
313 temp = OCNT(acs);
314 OCNT(acs) = ccre - acs;
315 acs -= temp;
316 } while (temp != 0);
317 acs = NIL;
318 }
319 return;
320 }
321 /* end of convertre */
322
323
324 /*
325 * The following routine recognises an irregular expresion
326 * with the following special characters:
327 *
328 * \? - means last match was optional
329 * \a - matches any number of characters
330 * \d - matches any number of spaces and tabs
331 * \p - matches any number of alphanumeric
332 * characters. The
333 * characters matched will be copied into
334 * the area pointed to by 'name'.
335 * \| - alternation
336 * \( \) - grouping used mostly for alternation and
337 * optionality
338 *
339 * The irregular expression must be translated to internal form
340 * prior to calling this routine
341 *
342 * The value returned is the pointer to the first non \a
343 * character matched.
344 */
345
346 char *
347 expmatch(
348 char *s, /* string to check for a match in */
349 char *re, /* a converted irregular expression */
350 char *mstring) /* where to put whatever matches a \p */
351 {
352 char *cs; /* the current symbol */
353 char *ptr,*s1; /* temporary pointer */
354 boolean matched; /* a temporary boolean */
355
356 /* initial conditions */
357 if (re == NIL)
358 return (NIL);
359 cs = re;
360 matched = FALSE;
361
362 /* loop till expression string is exhausted (or at least pretty tired) */
363 while (*cs) {
364 switch (*cs & (OPER | STR | META)) {
365
366 /* try to match a string */
367 case STR:
368 matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
369 if (matched) {
370
371 /* hoorah it matches */
372 s += SCNT(cs);
373 cs = SNEXT(cs);
374 } else if (*cs & ALT) {
375
376 /* alternation, skip to next expression */
377 cs = SNEXT(cs);
378 } else if (*cs & OPT) {
379
380 /* the match is optional */
381 cs = SNEXT(cs);
382 matched = 1; /* indicate a successful match */
383 } else {
384
385 /* no match, error return */
386 return (NIL);
387 }
388 break;
389
390 /* an operator, do something fancy */
391 case OPER:
392 switch (OSYM(cs)) {
393
394 /* this is an alternation */
395 case '|':
396 if (matched)
397
398 /* last thing in the alternation was a match, skip ahead */
399 cs = OPTR(cs);
400 else
401
402 /* no match, keep trying */
403 cs = ONEXT(cs);
404 break;
405
406 /* this is a grouping, recurse */
407 case '(':
408 ptr = expmatch(s, ONEXT(cs), mstring);
409 if (ptr != NIL) {
410
411 /* the subexpression matched */
412 matched = 1;
413 s = ptr;
414 } else if (*cs & ALT) {
415
416 /* alternation, skip to next expression */
417 matched = 0;
418 } else if (*cs & OPT) {
419
420 /* the match is optional */
421 matched = 1; /* indicate a successful match */
422 } else {
423
424 /* no match, error return */
425 return (NIL);
426 }
427 cs = OPTR(cs);
428 break;
429 }
430 break;
431
432 /* try to match a metasymbol */
433 case META:
434 switch (MSYM(cs)) {
435
436 /* try to match anything and remember what was matched */
437 case 'p':
438 /*
439 * This is really the same as trying the match the
440 * remaining parts of the expression to any subset
441 * of the string.
442 */
443 s1 = s;
444 do {
445 ptr = expmatch(s1, MNEXT(cs), mstring);
446 if (ptr != NIL && s1 != s) {
447
448 /* we have a match, remember the match */
449 strncpy(mstring, s, s1 - s);
450 mstring[s1 - s] = '\0';
451 return (ptr);
452 } else if (ptr != NIL && (*cs & OPT)) {
453
454 /* it was aoptional so no match is ok */
455 return (ptr);
456 } else if (ptr != NIL) {
457
458 /* not optional and we still matched */
459 return (NIL);
460 }
461 if (!isalnum((unsigned char)*s1) && *s1 != '_')
462 return (NIL);
463 if (*s1 == '\\')
464 x_escaped = x_escaped ? FALSE : TRUE;
465 else
466 x_escaped = FALSE;
467 } while (*s1++);
468 return (NIL);
469
470 /* try to match anything */
471 case 'a':
472 /*
473 * This is really the same as trying the match the
474 * remaining parts of the expression to any subset
475 * of the string.
476 */
477 s1 = s;
478 do {
479 ptr = expmatch(s1, MNEXT(cs), mstring);
480 if (ptr != NIL && s1 != s) {
481
482 /* we have a match */
483 return (ptr);
484 } else if (ptr != NIL && (*cs & OPT)) {
485
486 /* it was aoptional so no match is ok */
487 return (ptr);
488 } else if (ptr != NIL) {
489
490 /* not optional and we still matched */
491 return (NIL);
492 }
493 if (*s1 == '\\')
494 x_escaped = x_escaped ? FALSE : TRUE;
495 else
496 x_escaped = FALSE;
497 } while (*s1++);
498 return (NIL);
499
500 /* fail if we are currently x_escaped */
501 case 'e':
502 if (x_escaped)
503 return(NIL);
504 cs = MNEXT(cs);
505 break;
506
507 /* match any number of tabs and spaces */
508 case 'd':
509 ptr = s;
510 while (*s == ' ' || *s == '\t')
511 s++;
512 if (s != ptr || s == x_start) {
513
514 /* match, be happy */
515 matched = 1;
516 cs = MNEXT(cs);
517 } else if (*s == '\n' || *s == '\0') {
518
519 /* match, be happy */
520 matched = 1;
521 cs = MNEXT(cs);
522 } else if (*cs & ALT) {
523
524 /* try the next part */
525 matched = 0;
526 cs = MNEXT(cs);
527 } else if (*cs & OPT) {
528
529 /* doesn't matter */
530 matched = 1;
531 cs = MNEXT(cs);
532 } else
533
534 /* no match, error return */
535 return (NIL);
536 break;
537
538 /* check for end of line */
539 case '$':
540 if (*s == '\0' || *s == '\n') {
541
542 /* match, be happy */
543 s++;
544 matched = 1;
545 cs = MNEXT(cs);
546 } else if (*cs & ALT) {
547
548 /* try the next part */
549 matched = 0;
550 cs = MNEXT(cs);
551 } else if (*cs & OPT) {
552
553 /* doesn't matter */
554 matched = 1;
555 cs = MNEXT(cs);
556 } else
557
558 /* no match, error return */
559 return (NIL);
560 break;
561
562 /* check for start of line */
563 case '^':
564 if (s == x_start) {
565
566 /* match, be happy */
567 matched = 1;
568 cs = MNEXT(cs);
569 } else if (*cs & ALT) {
570
571 /* try the next part */
572 matched = 0;
573 cs = MNEXT(cs);
574 } else if (*cs & OPT) {
575
576 /* doesn't matter */
577 matched = 1;
578 cs = MNEXT(cs);
579 } else
580
581 /* no match, error return */
582 return (NIL);
583 break;
584
585 /* end of a subexpression, return success */
586 case ')':
587 return (s);
588 }
589 break;
590 }
591 }
592 return (s);
593 }
594